diff options
Diffstat (limited to 'regcomp.c')
-rw-r--r-- | regcomp.c | 10477 |
1 files changed, 102 insertions, 10375 deletions
@@ -31,29 +31,25 @@ * with the POSIX routines of the same names. */ -#ifdef PERL_EXT_RE_BUILD -#include "re_top.h" -#endif - /* * pregcomp and pregexec -- regsub and regerror are not used in perl * - * Copyright (c) 1986 by University of Toronto. - * Written by Henry Spencer. Not derived from licensed software. + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. * - * Permission is granted to anyone to use this software for any - * purpose on any computer system, and to redistribute it freely, - * subject to the following restrictions: + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: * - * 1. The author is not responsible for the consequences of use of - * this software, no matter how awful, even if they arise - * from defects in it. + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. * - * 2. The origin of this software must not be misrepresented, either - * by explicit claim or by omission. + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. * - * 3. Altered versions must be plainly marked as such, and must not - * be misrepresented as being the original software. + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. * * **** Alterations to Henry's code are... @@ -127,11 +123,16 @@ * access data that we don't want to duplicate. */ +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + #include "EXTERN.h" +#define PERL_IN_REGEX_ENGINE +#define PERL_IN_REGCOMP_ANY #define PERL_IN_REGCOMP_C #include "perl.h" -#define REG_COMP_C #ifdef PERL_IN_XSUB_RE # include "re_comp.h" EXTERN_C const struct regexp_engine my_reg_engine; @@ -142,1349 +143,7 @@ EXTERN_C const struct regexp_engine wild_reg_engine; #include "invlist_inline.h" #include "unicode_constants.h" - -#ifndef STATIC -#define STATIC static -#endif - -/* this is a chain of data about sub patterns we are processing that - need to be handled separately/specially in study_chunk. Its so - we can simulate recursion without losing state. */ -struct scan_frame; -typedef struct scan_frame { - regnode *last_regnode; /* last node to process in this frame */ - regnode *next_regnode; /* next node to process when last is reached */ - U32 prev_recursed_depth; - I32 stopparen; /* what stopparen do we use */ - bool in_gosub; /* this or an outer frame is for GOSUB */ - - struct scan_frame *this_prev_frame; /* this previous frame */ - struct scan_frame *prev_frame; /* previous frame */ - struct scan_frame *next_frame; /* next frame */ -} scan_frame; - -/* Certain characters are output as a sequence with the first being a - * backslash. */ -#define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c) - - -struct RExC_state_t { - U32 flags; /* RXf_* are we folding, multilining? */ - U32 pm_flags; /* PMf_* stuff from the calling PMOP */ - char *precomp; /* uncompiled string. */ - char *precomp_end; /* pointer to end of uncompiled string. */ - REGEXP *rx_sv; /* The SV that is the regexp. */ - regexp *rx; /* perl core regexp structure */ - regexp_internal *rxi; /* internal data for regexp object - pprivate field */ - char *start; /* Start of input for compile */ - char *end; /* End of input for compile */ - char *parse; /* Input-scan pointer. */ - char *copy_start; /* start of copy of input within - constructed parse string */ - char *save_copy_start; /* Provides one level of saving - and restoring 'copy_start' */ - char *copy_start_in_input; /* Position in input string - corresponding to copy_start */ - SSize_t whilem_seen; /* number of WHILEM in this expr */ - regnode *emit_start; /* Start of emitted-code area */ - regnode_offset emit; /* Code-emit pointer */ - I32 naughty; /* How bad is this pattern? */ - I32 sawback; /* Did we see \1, ...? */ - SSize_t size; /* Number of regnode equivalents in - pattern */ - Size_t sets_depth; /* Counts recursion depth of already- - compiled regex set patterns */ - U32 seen; - - I32 parens_buf_size; /* #slots malloced open/close_parens */ - regnode_offset *open_parens; /* offsets to open parens */ - regnode_offset *close_parens; /* offsets to close parens */ - HV *paren_names; /* Paren names */ - - /* position beyond 'precomp' of the warning message furthest away from - * 'precomp'. During the parse, no warnings are raised for any problems - * earlier in the parse than this position. This works if warnings are - * raised the first time a given spot is parsed, and if only one - * independent warning is raised for any given spot */ - Size_t latest_warn_offset; - - I32 npar; /* Capture buffer count so far in the - parse, (OPEN) plus one. ("par" 0 is - the whole pattern)*/ - I32 total_par; /* During initial parse, is either 0, - or -1; the latter indicating a - reparse is needed. After that pass, - it is what 'npar' became after the - pass. Hence, it being > 0 indicates - we are in a reparse situation */ - I32 nestroot; /* root parens we are in - used by - accept */ - I32 seen_zerolen; - regnode *end_op; /* END node in program */ - I32 utf8; /* whether the pattern is utf8 or not */ - I32 orig_utf8; /* whether the pattern was originally in utf8 */ - /* XXX use this for future optimisation of case - * where pattern must be upgraded to utf8. */ - I32 uni_semantics; /* If a d charset modifier should use unicode - rules, even if the pattern is not in - utf8 */ - - I32 recurse_count; /* Number of recurse regops we have generated */ - regnode **recurse; /* Recurse regops */ - U8 *study_chunk_recursed; /* bitmap of which subs we have moved - through */ - U32 study_chunk_recursed_bytes; /* bytes in bitmap */ - I32 in_lookaround; - I32 contains_locale; - I32 override_recoding; - I32 recode_x_to_native; - I32 in_multi_char_class; - int code_index; /* next code_blocks[] slot */ - struct reg_code_blocks *code_blocks;/* positions of literal (?{}) - within pattern */ - SSize_t maxlen; /* mininum possible number of chars in string to match */ - scan_frame *frame_head; - scan_frame *frame_last; - U32 frame_count; - AV *warn_text; - HV *unlexed_names; - SV *runtime_code_qr; /* qr with the runtime code blocks */ -#ifdef DEBUGGING - const char *lastparse; - I32 lastnum; - U32 study_chunk_recursed_count; - AV *paren_name_list; /* idx -> name */ - SV *mysv1; - SV *mysv2; - -#define RExC_lastparse (pRExC_state->lastparse) -#define RExC_lastnum (pRExC_state->lastnum) -#define RExC_paren_name_list (pRExC_state->paren_name_list) -#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count) -#define RExC_mysv (pRExC_state->mysv1) -#define RExC_mysv1 (pRExC_state->mysv1) -#define RExC_mysv2 (pRExC_state->mysv2) - -#endif - bool seen_d_op; - bool strict; - bool study_started; - bool in_script_run; - bool use_BRANCHJ; - bool sWARN_EXPERIMENTAL__VLB; - bool sWARN_EXPERIMENTAL__REGEX_SETS; -}; - -#define RExC_flags (pRExC_state->flags) -#define RExC_pm_flags (pRExC_state->pm_flags) -#define RExC_precomp (pRExC_state->precomp) -#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input) -#define RExC_copy_start_in_constructed (pRExC_state->copy_start) -#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start) -#define RExC_precomp_end (pRExC_state->precomp_end) -#define RExC_rx_sv (pRExC_state->rx_sv) -#define RExC_rx (pRExC_state->rx) -#define RExC_rxi (pRExC_state->rxi) -#define RExC_start (pRExC_state->start) -#define RExC_end (pRExC_state->end) -#define RExC_parse (pRExC_state->parse) -#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset ) -#define RExC_whilem_seen (pRExC_state->whilem_seen) -#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs - under /d from /u ? */ - -#define RExC_emit (pRExC_state->emit) -#define RExC_emit_start (pRExC_state->emit_start) -#define RExC_sawback (pRExC_state->sawback) -#define RExC_seen (pRExC_state->seen) -#define RExC_size (pRExC_state->size) -#define RExC_maxlen (pRExC_state->maxlen) -#define RExC_npar (pRExC_state->npar) -#define RExC_total_parens (pRExC_state->total_par) -#define RExC_parens_buf_size (pRExC_state->parens_buf_size) -#define RExC_nestroot (pRExC_state->nestroot) -#define RExC_seen_zerolen (pRExC_state->seen_zerolen) -#define RExC_utf8 (pRExC_state->utf8) -#define RExC_uni_semantics (pRExC_state->uni_semantics) -#define RExC_orig_utf8 (pRExC_state->orig_utf8) -#define RExC_open_parens (pRExC_state->open_parens) -#define RExC_close_parens (pRExC_state->close_parens) -#define RExC_end_op (pRExC_state->end_op) -#define RExC_paren_names (pRExC_state->paren_names) -#define RExC_recurse (pRExC_state->recurse) -#define RExC_recurse_count (pRExC_state->recurse_count) -#define RExC_sets_depth (pRExC_state->sets_depth) -#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) -#define RExC_study_chunk_recursed_bytes \ - (pRExC_state->study_chunk_recursed_bytes) -#define RExC_in_lookaround (pRExC_state->in_lookaround) -#define RExC_contains_locale (pRExC_state->contains_locale) -#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) - -#ifdef EBCDIC -# define SET_recode_x_to_native(x) \ - STMT_START { RExC_recode_x_to_native = (x); } STMT_END -#else -# define SET_recode_x_to_native(x) NOOP -#endif - -#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) -#define RExC_frame_head (pRExC_state->frame_head) -#define RExC_frame_last (pRExC_state->frame_last) -#define RExC_frame_count (pRExC_state->frame_count) -#define RExC_strict (pRExC_state->strict) -#define RExC_study_started (pRExC_state->study_started) -#define RExC_warn_text (pRExC_state->warn_text) -#define RExC_in_script_run (pRExC_state->in_script_run) -#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ) -#define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB) -#define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS) -#define RExC_unlexed_names (pRExC_state->unlexed_names) - - -/***********************************************************************/ -/* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse - * - * All of these macros depend on the above RExC_ accessor macros, which - * in turns depend on a variable pRExC_state being in scope where they - * are used. This is the standard regexp parser context variable which is - * passed into every non-trivial parse function in this file. - * - * Note that the UTF macro is itself a wrapper around RExC_utf8, so all - * of the macros which do not take an argument will operate on the - * pRExC_state structure *only*. - * - * Please do NOT modify RExC_parse without using these macros. In the - * future these macros will be extended for enhanced debugging and trace - * output during the parse process. - */ - -/* RExC_parse_incf(flag) - * - * Increment RExC_parse to point at the next codepoint, while doing - * the right thing depending on whether we are parsing UTF-8 strings - * or not. The 'flag' argument determines if content is UTF-8 or not, - * intended for cases where this is NOT governed by the UTF macro. - * - * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro. - * - * WARNING: Does NOT take into account RExC_end; it is the callers - * responsibility to make sure there are enough octets left in - * RExC_parse to ensure that when processing UTF-8 we would not read - * past the end of the string. - */ -#define RExC_parse_incf(flag) STMT_START { \ - RExC_parse += (flag) ? UTF8SKIP(RExC_parse) : 1; \ -} STMT_END - -/* RExC_parse_inc_safef(flag) - * - * Safely increment RExC_parse to point at the next codepoint, - * doing the right thing depending on whether we are parsing - * UTF-8 strings or not and NOT reading past the end of the buffer. - * The 'flag' argument determines if content is UTF-8 or not, - * intended for cases where this is NOT governed by the UTF macro. - * - * Use RExC_parse_safe() if UTF-8ness is controlled by the UTF macro. - * - * NOTE: Will NOT read past RExC_end when content is UTF-8. - */ -#define RExC_parse_inc_safef(flag) STMT_START { \ - RExC_parse += (flag) ? UTF8_SAFE_SKIP(RExC_parse,RExC_end) : 1; \ -} STMT_END - -/* RExC_parse_inc() - * - * Increment RExC_parse to point at the next codepoint, - * doing the right thing depending on whether we are parsing - * UTF-8 strings or not. - * - * WARNING: Does NOT take into account RExC_end, it is the callers - * responsibility to make sure there are enough octets left in - * RExC_parse to ensure that when processing UTF-8 we would not read - * past the end of the string. - * - * NOTE: whether we are parsing UTF-8 or not is determined by the - * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this - * macro operates on the pRExC_state structure only. - */ -#define RExC_parse_inc() RExC_parse_incf(UTF) - -/* RExC_parse_inc_safe() - * - * Safely increment RExC_parse to point at the next codepoint, - * doing the right thing depending on whether we are parsing - * UTF-8 strings or not and NOT reading past the end of the buffer. - * - * NOTE: whether we are parsing UTF-8 or not is determined by the - * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this - * macro operates on the pRExC_state structure only. - */ -#define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF) - -/* RExC_parse_inc_utf8() - * - * Increment RExC_parse to point at the next utf8 codepoint, - * assumes content is UTF-8. - * - * WARNING: Does NOT take into account RExC_end; it is the callers - * responsibility to make sure there are enough octets left in RExC_parse - * to ensure that when processing UTF-8 we would not read past the end - * of the string. - */ -#define RExC_parse_inc_utf8() STMT_START { \ - RExC_parse += UTF8SKIP(RExC_parse); \ -} STMT_END - -/* RExC_parse_inc_if_char() - * - * Increment RExC_parse to point at the next codepoint, if and only - * if the current parse point is NOT a NULL, while doing the right thing - * depending on whether we are parsing UTF-8 strings or not. - * - * WARNING: Does NOT take into account RExC_end, it is the callers - * responsibility to make sure there are enough octets left in RExC_parse - * to ensure that when processing UTF-8 we would not read past the end - * of the string. - * - * NOTE: whether we are parsing UTF-8 or not is determined by the - * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this - * macro operates on the pRExC_state structure only. - */ -#define RExC_parse_inc_if_char() STMT_START { \ - RExC_parse += SKIP_IF_CHAR(RExC_parse,RExC_end); \ -} STMT_END - -/* RExC_parse_inc_by(n_octets) - * - * Increment the parse cursor by the number of octets specified by - * the 'n_octets' argument. - * - * NOTE: Does NOT check ANY constraints. It is the callers responsibility - * that this will not move past the end of the string, or leave the - * pointer in the middle of a UTF-8 sequence. - * - * Typically used to advanced past previously analyzed content. - */ -#define RExC_parse_inc_by(n_octets) STMT_START { \ - RExC_parse += (n_octets); \ -} STMT_END - -/* RExC_parse_set(to_ptr) - * - * Sets the RExC_parse pointer to the pointer specified by the 'to' - * argument. No validation whatsoever is performed on the to pointer. - */ -#define RExC_parse_set(to_ptr) STMT_START { \ - RExC_parse = (to_ptr); \ -} STMT_END - -/**********************************************************************/ - -/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set - * a flag to disable back-off on the fixed/floating substrings - if it's - * a high complexity pattern we assume the benefit of avoiding a full match - * is worth the cost of checking for the substrings even if they rarely help. - */ -#define RExC_naughty (pRExC_state->naughty) -#define TOO_NAUGHTY (10) -#define MARK_NAUGHTY(add) \ - if (RExC_naughty < TOO_NAUGHTY) \ - RExC_naughty += (add) -#define MARK_NAUGHTY_EXP(exp, add) \ - if (RExC_naughty < TOO_NAUGHTY) \ - RExC_naughty += RExC_naughty / (exp) + (add) - -#define isNON_BRACE_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?') -#define isQUANTIFIER(s,e) ( isNON_BRACE_QUANTIFIER(*s) \ - || ((*s) == '{' && regcurly(s, e, NULL))) - -/* - * Flags to be passed up. - */ -#define HASWIDTH 0x01 /* Known to not match null strings, could match - non-null ones. */ -#define SIMPLE 0x02 /* Exactly one character wide */ - /* (or LNBREAK as a special case) */ -#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ -#define TRYAGAIN 0x10 /* Weeded out a declaration. */ -#define RESTART_PARSE 0x20 /* Need to redo the parse */ -#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to - calcuate sizes as UTF-8 */ - -#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) - -/* whether trie related optimizations are enabled */ -#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION -#define TRIE_STUDY_OPT -#define FULL_TRIE_STUDY -#define TRIE_STCLASS -#endif - -/* About the term "restudy" and the var "restudied" and the defines - * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to - * doing multiple study_chunk() calls over the same set of opcodes for* the - * purpose of enhanced TRIE optimizations. - * - * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal - * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled - * which then causes the Perl_re_op_compile() to then call the optimizer - * S_study_chunk() a second time to perform additional optimizations, - * including the aho_corasick startclass optimization. - * This additional pass will only happen once, which is managed by the - * 'restudied' variable in Perl_re_op_compile(). - * - * When this second pass is under way the flags passed into study_chunk() will - * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down - * to any recursive calls to S_study_chunk(). - * - * IMPORTANT: Any logic in study_chunk() that emits warnings should check that - * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may - * be produced twice. - * - * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and - * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details. - */ - - -#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] -#define PBITVAL(paren) (1 << ((paren) & 7)) -#define PAREN_OFFSET(depth) \ - (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes) -#define PAREN_TEST(depth, paren) \ - (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren)) -#define PAREN_SET(depth, paren) \ - (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren)) -#define PAREN_UNSET(depth, paren) \ - (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren)) - -#define REQUIRE_UTF8(flagp) STMT_START { \ - if (!UTF) { \ - *flagp = RESTART_PARSE|NEED_UTF8; \ - return 0; \ - } \ - } STMT_END - -/* /u is to be chosen if we are supposed to use Unicode rules, or if the - * pattern is in UTF-8. This latter condition is in case the outermost rules - * are locale. See GH #17278 */ -#define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF) - -/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is - * a flag that indicates we need to override /d with /u as a result of - * something in the pattern. It should only be used in regards to calling - * set_regex_charset() or get_regex_charset() */ -#define REQUIRE_UNI_RULES(flagp, restart_retval) \ - STMT_START { \ - if (DEPENDS_SEMANTICS) { \ - set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ - RExC_uni_semantics = 1; \ - if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \ - /* No need to restart the parse if we haven't seen \ - * anything that differs between /u and /d, and no need \ - * to restart immediately if we're going to reparse \ - * anyway to count parens */ \ - *flagp |= RESTART_PARSE; \ - return restart_retval; \ - } \ - } \ - } STMT_END - -#define REQUIRE_BRANCHJ(flagp, restart_retval) \ - STMT_START { \ - RExC_use_BRANCHJ = 1; \ - *flagp |= RESTART_PARSE; \ - return restart_retval; \ - } STMT_END - -/* Until we have completed the parse, we leave RExC_total_parens at 0 or - * less. After that, it must always be positive, because the whole re is - * considered to be surrounded by virtual parens. Setting it to negative - * indicates there is some construct that needs to know the actual number of - * parens to be properly handled. And that means an extra pass will be - * required after we've counted them all */ -#define ALL_PARENS_COUNTED (RExC_total_parens > 0) -#define REQUIRE_PARENS_PASS \ - STMT_START { /* No-op if have completed a pass */ \ - if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \ - } STMT_END -#define IN_PARENS_PASS (RExC_total_parens < 0) - - -/* This is used to return failure (zero) early from the calling function if - * various flags in 'flags' are set. Two flags always cause a return: - * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any - * additional flags that should cause a return; 0 if none. If the return will - * be done, '*flagp' is first set to be all of the flags that caused the - * return. */ -#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \ - STMT_START { \ - if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \ - *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \ - return 0; \ - } \ - } STMT_END - -#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE)) - -#define RETURN_FAIL_ON_RESTART(flags,flagp) \ - RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0) -#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \ - if (MUST_RESTART(*(flagp))) return 0 - -/* This converts the named class defined in regcomp.h to its equivalent class - * number defined in handy.h. */ -#define namedclass_to_classnum(class) ((int) ((class) / 2)) -#define classnum_to_namedclass(classnum) ((classnum) * 2) - -#define _invlist_union_complement_2nd(a, b, output) \ - _invlist_union_maybe_complement_2nd(a, b, TRUE, output) -#define _invlist_intersection_complement_2nd(a, b, output) \ - _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) - -/* We add a marker if we are deferring expansion of a property that is both - * 1) potentiallly user-defined; and - * 2) could also be an official Unicode property. - * - * Without this marker, any deferred expansion can only be for a user-defined - * one. This marker shouldn't conflict with any that could be in a legal name, - * and is appended to its name to indicate this. There is a string and - * character form */ -#define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~" -#define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~' - -/* What is infinity for optimization purposes */ -#define OPTIMIZE_INFTY SSize_t_MAX - -/* About scan_data_t. - - During optimisation we recurse through the regexp program performing - various inplace (keyhole style) optimisations. In addition study_chunk - and scan_commit populate this data structure with information about - what strings MUST appear in the pattern. We look for the longest - string that must appear at a fixed location, and we look for the - longest string that may appear at a floating location. So for instance - in the pattern: - - /FOO[xX]A.*B[xX]BAR/ - - Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating - strings (because they follow a .* construct). study_chunk will identify - both FOO and BAR as being the longest fixed and floating strings respectively. - - The strings can be composites, for instance - - /(f)(o)(o)/ - - will result in a composite fixed substring 'foo'. - - For each string some basic information is maintained: - - - min_offset - This is the position the string must appear at, or not before. - It also implicitly (when combined with minlenp) tells us how many - characters must match before the string we are searching for. - Likewise when combined with minlenp and the length of the string it - tells us how many characters must appear after the string we have - found. - - - max_offset - Only used for floating strings. This is the rightmost point that - the string can appear at. If set to OPTIMIZE_INFTY it indicates that the - string can occur infinitely far to the right. - For fixed strings, it is equal to min_offset. - - - minlenp - A pointer to the minimum number of characters of the pattern that the - string was found inside. This is important as in the case of positive - lookahead or positive lookbehind we can have multiple patterns - involved. Consider - - /(?=FOO).*F/ - - The minimum length of the pattern overall is 3, the minimum length - of the lookahead part is 3, but the minimum length of the part that - will actually match is 1. So 'FOO's minimum length is 3, but the - minimum length for the F is 1. This is important as the minimum length - is used to determine offsets in front of and behind the string being - looked for. Since strings can be composites this is the length of the - pattern at the time it was committed with a scan_commit. Note that - the length is calculated by study_chunk, so that the minimum lengths - are not known until the full pattern has been compiled, thus the - pointer to the value. - - - lookbehind - - In the case of lookbehind the string being searched for can be - offset past the start point of the final matching string. - If this value was just blithely removed from the min_offset it would - invalidate some of the calculations for how many chars must match - before or after (as they are derived from min_offset and minlen and - the length of the string being searched for). - When the final pattern is compiled and the data is moved from the - scan_data_t structure into the regexp structure the information - about lookbehind is factored in, with the information that would - have been lost precalculated in the end_shift field for the - associated string. - - The fields pos_min and pos_delta are used to store the minimum offset - and the delta to the maximum offset at the current point in the pattern. - -*/ - -struct scan_data_substrs { - SV *str; /* longest substring found in pattern */ - SSize_t min_offset; /* earliest point in string it can appear */ - SSize_t max_offset; /* latest point in string it can appear */ - SSize_t *minlenp; /* pointer to the minlen relevant to the string */ - SSize_t lookbehind; /* is the pos of the string modified by LB */ - I32 flags; /* per substring SF_* and SCF_* flags */ -}; - -typedef struct scan_data_t { - /*I32 len_min; unused */ - /*I32 len_delta; unused */ - SSize_t pos_min; - SSize_t pos_delta; - SV *last_found; - SSize_t last_end; /* min value, <0 unless valid. */ - SSize_t last_start_min; - SSize_t last_start_max; - U8 cur_is_floating; /* whether the last_* values should be set as - * the next fixed (0) or floating (1) - * substring */ - - /* [0] is longest fixed substring so far, [1] is longest float so far */ - struct scan_data_substrs substrs[2]; - - I32 flags; /* common SF_* and SCF_* flags */ - I32 whilem_c; - SSize_t *last_closep; - regnode **last_close_opp; /* pointer to pointer to last CLOSE regop - seen. DO NOT DEREFERENCE the regnode - pointer - the op may have been optimized - away */ - regnode_ssc *start_class; -} scan_data_t; - -/* - * Forward declarations for pregcomp()'s friends. - */ - -static const scan_data_t zero_scan_data = { - 0, 0, NULL, 0, 0, 0, 0, - { - { NULL, 0, 0, 0, 0, 0 }, - { NULL, 0, 0, 0, 0, 0 }, - }, - 0, 0, NULL, NULL, NULL -}; - -/* study flags */ - -#define SF_BEFORE_SEOL 0x0001 -#define SF_BEFORE_MEOL 0x0002 -#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) - -#define SF_IS_INF 0x0040 -#define SF_HAS_PAR 0x0080 -#define SF_IN_PAR 0x0100 -#define SF_HAS_EVAL 0x0200 - - -/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the - * longest substring in the pattern. When it is not set the optimiser keeps - * track of position, but does not keep track of the actual strings seen, - * - * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but - * /foo/i will not. - * - * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble" - * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be - * turned off because of the alternation (BRANCH). */ -#define SCF_DO_SUBSTR 0x0400 - -#define SCF_DO_STCLASS_AND 0x0800 -#define SCF_DO_STCLASS_OR 0x1000 -#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) -#define SCF_WHILEM_VISITED_POS 0x2000 - -#define SCF_TRIE_RESTUDY 0x4000 /* Need to do restudy in study_chunk()? - Search for "restudy" in this file - to find a detailed explanation.*/ -#define SCF_SEEN_ACCEPT 0x8000 -#define SCF_TRIE_DOING_RESTUDY 0x10000 /* Are we in restudy right now? - Search for "restudy" in this file - to find a detailed explanation. */ -#define SCF_IN_DEFINE 0x20000 - - - -#define UTF cBOOL(RExC_utf8) - -/* The enums for all these are ordered so things work out correctly */ -#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) -#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ - == REGEX_DEPENDS_CHARSET) -#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) -#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ - >= REGEX_UNICODE_CHARSET) -#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ - == REGEX_ASCII_RESTRICTED_CHARSET) -#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ - >= REGEX_ASCII_RESTRICTED_CHARSET) -#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ - == REGEX_ASCII_MORE_RESTRICTED_CHARSET) - -#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) - -/* For programs that want to be strictly Unicode compatible by dying if any - * attempt is made to match a non-Unicode code point against a Unicode - * property. */ -#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) - -#define OOB_NAMEDCLASS -1 - -/* There is no code point that is out-of-bounds, so this is problematic. But - * its only current use is to initialize a variable that is always set before - * looked at. */ -#define OOB_UNICODE 0xDEADBEEF - -#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) - - -/* length of regex to show in messages that don't mark a position within */ -#define RegexLengthToShowInErrorMessages 127 - -/* - * If MARKER[12] are adjusted, be sure to adjust the constants at the top - * of t/op/regmesg.t, the tests in t/op/re_tests, and those in - * op/pragma/warn/regcomp. - */ -#define MARKER1 "<-- HERE" /* marker as it appears in the description */ -#define MARKER2 " <-- HERE " /* marker as it appears within the regex */ - -#define REPORT_LOCATION " in regex; marked by " MARKER1 \ - " in m/%" UTF8f MARKER2 "%" UTF8f "/" - -/* The code in this file in places uses one level of recursion with parsing - * rebased to an alternate string constructed by us in memory. This can take - * the form of something that is completely different from the input, or - * something that uses the input as part of the alternate. In the first case, - * there should be no possibility of an error, as we are in complete control of - * the alternate string. But in the second case we don't completely control - * the input portion, so there may be errors in that. Here's an example: - * /[abc\x{DF}def]/ui - * is handled specially because \x{df} folds to a sequence of more than one - * character: 'ss'. What is done is to create and parse an alternate string, - * which looks like this: - * /(?:\x{DF}|[abc\x{DF}def])/ui - * where it uses the input unchanged in the middle of something it constructs, - * which is a branch for the DF outside the character class, and clustering - * parens around the whole thing. (It knows enough to skip the DF inside the - * class while in this substitute parse.) 'abc' and 'def' may have errors that - * need to be reported. The general situation looks like this: - * - * |<------- identical ------>| - * sI tI xI eI - * Input: --------------------------------------------------------------- - * Constructed: --------------------------------------------------- - * sC tC xC eC EC - * |<------- identical ------>| - * - * sI..eI is the portion of the input pattern we are concerned with here. - * sC..EC is the constructed substitute parse string. - * sC..tC is constructed by us - * tC..eC is an exact duplicate of the portion of the input pattern tI..eI. - * In the diagram, these are vertically aligned. - * eC..EC is also constructed by us. - * xC is the position in the substitute parse string where we found a - * problem. - * xI is the position in the original pattern corresponding to xC. - * - * We want to display a message showing the real input string. Thus we need to - * translate from xC to xI. We know that xC >= tC, since the portion of the - * string sC..tC has been constructed by us, and so shouldn't have errors. We - * get: - * xI = tI + (xC - tC) - * - * When the substitute parse is constructed, the code needs to set: - * RExC_start (sC) - * RExC_end (eC) - * RExC_copy_start_in_input (tI) - * RExC_copy_start_in_constructed (tC) - * and restore them when done. - * - * During normal processing of the input pattern, both - * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to - * sI, so that xC equals xI. - */ - -#define sI RExC_precomp -#define eI RExC_precomp_end -#define sC RExC_start -#define eC RExC_end -#define tI RExC_copy_start_in_input -#define tC RExC_copy_start_in_constructed -#define xI(xC) (tI + (xC - tC)) -#define xI_offset(xC) (xI(xC) - sI) - -#define REPORT_LOCATION_ARGS(xC) \ - UTF8fARG(UTF, \ - (xI(xC) > eI) /* Don't run off end */ \ - ? eI - sI /* Length before the <--HERE */ \ - : ((xI_offset(xC) >= 0) \ - ? xI_offset(xC) \ - : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \ - IVdf " trying to output message for " \ - " pattern %.*s", \ - __FILE__, __LINE__, (IV) xI_offset(xC), \ - ((int) (eC - sC)), sC), 0)), \ - sI), /* The input pattern printed up to the <--HERE */ \ - UTF8fARG(UTF, \ - (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \ - (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */ - -/* Used to point after bad bytes for an error message, but avoid skipping - * past a nul byte. */ -#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1) - -/* Set up to clean up after our imminent demise */ -#define PREPARE_TO_DIE \ - STMT_START { \ - if (RExC_rx_sv) \ - SAVEFREESV(RExC_rx_sv); \ - if (RExC_open_parens) \ - SAVEFREEPV(RExC_open_parens); \ - if (RExC_close_parens) \ - SAVEFREEPV(RExC_close_parens); \ - } STMT_END - -/* - * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given - * arg. Show regex, up to a maximum length. If it's too long, chop and add - * "...". - */ -#define _FAIL(code) STMT_START { \ - const char *ellipses = ""; \ - IV len = RExC_precomp_end - RExC_precomp; \ - \ - PREPARE_TO_DIE; \ - if (len > RegexLengthToShowInErrorMessages) { \ - /* chop 10 shorter than the max, to ensure meaning of "..." */ \ - len = RegexLengthToShowInErrorMessages - 10; \ - ellipses = "..."; \ - } \ - code; \ -} STMT_END - -#define FAIL(msg) _FAIL( \ - Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \ - msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) - -#define FAIL2(msg,arg) _FAIL( \ - Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ - arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) - -#define FAIL3(msg,arg1,arg2) _FAIL( \ - Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ - arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses)) - -/* - * Simple_vFAIL -- like FAIL, but marks the current location in the scan - */ -#define Simple_vFAIL(m) STMT_START { \ - Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ - m, REPORT_LOCATION_ARGS(RExC_parse)); \ -} STMT_END - -/* - * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() - */ -#define vFAIL(m) STMT_START { \ - PREPARE_TO_DIE; \ - Simple_vFAIL(m); \ -} STMT_END - -/* - * Like Simple_vFAIL(), but accepts two arguments. - */ -#define Simple_vFAIL2(m,a1) STMT_START { \ - S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \ - REPORT_LOCATION_ARGS(RExC_parse)); \ -} STMT_END - -/* - * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). - */ -#define vFAIL2(m,a1) STMT_START { \ - PREPARE_TO_DIE; \ - Simple_vFAIL2(m, a1); \ -} STMT_END - - -/* - * Like Simple_vFAIL(), but accepts three arguments. - */ -#define Simple_vFAIL3(m, a1, a2) STMT_START { \ - S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \ - REPORT_LOCATION_ARGS(RExC_parse)); \ -} STMT_END - -/* - * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). - */ -#define vFAIL3(m,a1,a2) STMT_START { \ - PREPARE_TO_DIE; \ - Simple_vFAIL3(m, a1, a2); \ -} STMT_END - -/* - * Like Simple_vFAIL(), but accepts four arguments. - */ -#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ - S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \ - REPORT_LOCATION_ARGS(RExC_parse)); \ -} STMT_END - -#define vFAIL4(m,a1,a2,a3) STMT_START { \ - PREPARE_TO_DIE; \ - Simple_vFAIL4(m, a1, a2, a3); \ -} STMT_END - -/* A specialized version of vFAIL2 that works with UTF8f */ -#define vFAIL2utf8f(m, a1) STMT_START { \ - PREPARE_TO_DIE; \ - S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \ - REPORT_LOCATION_ARGS(RExC_parse)); \ -} STMT_END - -#define vFAIL3utf8f(m, a1, a2) STMT_START { \ - PREPARE_TO_DIE; \ - S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \ - REPORT_LOCATION_ARGS(RExC_parse)); \ -} STMT_END - -/* Setting this to NULL is a signal to not output warnings */ -#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \ - STMT_START { \ - RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\ - RExC_copy_start_in_constructed = NULL; \ - } STMT_END -#define RESTORE_WARNINGS \ - RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed - -/* Since a warning can be generated multiple times as the input is reparsed, we - * output it the first time we come to that point in the parse, but suppress it - * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not - * generate any warnings */ -#define TO_OUTPUT_WARNINGS(loc) \ - ( RExC_copy_start_in_constructed \ - && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset) - -/* After we've emitted a warning, we save the position in the input so we don't - * output it again */ -#define UPDATE_WARNINGS_LOC(loc) \ - STMT_START { \ - if (TO_OUTPUT_WARNINGS(loc)) { \ - RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \ - - RExC_precomp; \ - } \ - } STMT_END - -/* 'warns' is the output of the packWARNx macro used in 'code' */ -#define _WARN_HELPER(loc, warns, code) \ - STMT_START { \ - if (! RExC_copy_start_in_constructed) { \ - Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \ - " expected at '%s'", \ - __FILE__, __LINE__, loc); \ - } \ - if (TO_OUTPUT_WARNINGS(loc)) { \ - if (ckDEAD(warns)) \ - PREPARE_TO_DIE; \ - code; \ - UPDATE_WARNINGS_LOC(loc); \ - } \ - } STMT_END - -/* m is not necessarily a "literal string", in this macro */ -#define warn_non_literal_string(loc, packed_warn, m) \ - _WARN_HELPER(loc, packed_warn, \ - Perl_warner(aTHX_ packed_warn, \ - "%s" REPORT_LOCATION, \ - m, REPORT_LOCATION_ARGS(loc))) -#define reg_warn_non_literal_string(loc, m) \ - warn_non_literal_string(loc, packWARN(WARN_REGEXP), m) - -#define ckWARN2_non_literal_string(loc, packwarn, m, a1) \ - STMT_START { \ - char * format; \ - Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\ - Newx(format, format_size, char); \ - my_strlcpy(format, m, format_size); \ - my_strlcat(format, REPORT_LOCATION, format_size); \ - SAVEFREEPV(format); \ - _WARN_HELPER(loc, packwarn, \ - Perl_ck_warner(aTHX_ packwarn, \ - format, \ - a1, REPORT_LOCATION_ARGS(loc))); \ - } STMT_END - -#define ckWARNreg(loc,m) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(loc))) - -#define vWARN(loc, m) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(loc))) \ - -#define vWARN_dep(loc, m) \ - _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \ - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(loc))) - -#define ckWARNdep(loc,m) \ - _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \ - Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(loc))) - -#define ckWARNregdep(loc,m) \ - _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \ - WARN_REGEXP), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(loc))) - -#define ckWARN2reg_d(loc,m, a1) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, REPORT_LOCATION_ARGS(loc))) - -#define ckWARN2reg(loc, m, a1) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, REPORT_LOCATION_ARGS(loc))) - -#define vWARN3(loc, m, a1, a2) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, a2, REPORT_LOCATION_ARGS(loc))) - -#define ckWARN3reg(loc, m, a1, a2) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, a2, \ - REPORT_LOCATION_ARGS(loc))) - -#define vWARN4(loc, m, a1, a2, a3) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, a2, a3, \ - REPORT_LOCATION_ARGS(loc))) - -#define ckWARN4reg(loc, m, a1, a2, a3) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, a2, a3, \ - REPORT_LOCATION_ARGS(loc))) - -#define vWARN5(loc, m, a1, a2, a3, a4) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, a2, a3, a4, \ - REPORT_LOCATION_ARGS(loc))) - -#define ckWARNexperimental(loc, class, m) \ - STMT_START { \ - if (! RExC_warned_ ## class) { /* warn once per compilation */ \ - RExC_warned_ ## class = 1; \ - _WARN_HELPER(loc, packWARN(class), \ - Perl_ck_warner_d(aTHX_ packWARN(class), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(loc)));\ - } \ - } STMT_END - -#define ckWARNexperimental_with_arg(loc, class, m, arg) \ - STMT_START { \ - if (! RExC_warned_ ## class) { /* warn once per compilation */ \ - RExC_warned_ ## class = 1; \ - _WARN_HELPER(loc, packWARN(class), \ - Perl_ck_warner_d(aTHX_ packWARN(class), \ - m REPORT_LOCATION, \ - arg, REPORT_LOCATION_ARGS(loc)));\ - } \ - } STMT_END - -/* Convert between a pointer to a node and its offset from the beginning of the - * program */ -#define REGNODE_p(offset) (RExC_emit_start + (offset)) -#define REGNODE_OFFSET(node) (__ASSERT_((node) >= RExC_emit_start) \ - (SSize_t) ((node) - RExC_emit_start)) - -#define ProgLen(ri) ri->proglen -#define SetProgLen(ri,x) ri->proglen = x - -#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS -#define EXPERIMENTAL_INPLACESCAN -#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ - -STATIC void -S_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len) -{ - PERL_ARGS_ASSERT_POPULATE_BITMAP_FROM_INVLIST; - - /* As the name says. The zeroth bit corresponds to the code point given by - * 'offset' */ - - UV start, end; - - Zero(bitmap, len, U8); - - invlist_iterinit(invlist); - while (invlist_iternext(invlist, &start, &end)) { - assert(start >= offset); - - for (UV i = start; i <= end; i++) { - UV adjusted = i - offset; - - BITMAP_BYTE(bitmap, adjusted) |= BITMAP_BIT(adjusted); - } - } - invlist_iterfinish(invlist); -} - -STATIC void -S_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_len, SV ** invlist, const UV offset) -{ - PERL_ARGS_ASSERT_POPULATE_INVLIST_FROM_BITMAP; - - /* As the name says. The zeroth bit corresponds to the code point given by - * 'offset' */ - - Size_t i; - - for (i = 0; i < bitmap_len; i++) { - if (BITMAP_TEST(bitmap, i)) { - int start = i++; - - /* Save a little work by adding a range all at once instead of bit - * by bit */ - while (i < bitmap_len && BITMAP_TEST(bitmap, i)) { - i++; - } - - *invlist = _add_range_to_invlist(*invlist, - start + offset, - i + offset - 1); - } - } -} - -#ifdef DEBUGGING -int -Perl_re_printf(pTHX_ const char *fmt, ...) -{ - va_list ap; - int result; - PerlIO *f= Perl_debug_log; - PERL_ARGS_ASSERT_RE_PRINTF; - va_start(ap, fmt); - result = PerlIO_vprintf(f, fmt, ap); - va_end(ap); - return result; -} - -int -Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...) -{ - va_list ap; - int result; - PerlIO *f= Perl_debug_log; - PERL_ARGS_ASSERT_RE_INDENTF; - va_start(ap, depth); - PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, ""); - result = PerlIO_vprintf(f, fmt, ap); - va_end(ap); - return result; -} -#endif /* DEBUGGING */ - -#define DEBUG_RExC_seen() \ - DEBUG_OPTIMISE_MORE_r({ \ - Perl_re_printf( aTHX_ "RExC_seen: "); \ - \ - if (RExC_seen & REG_ZERO_LEN_SEEN) \ - Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \ - \ - if (RExC_seen & REG_LOOKBEHIND_SEEN) \ - Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \ - \ - if (RExC_seen & REG_GPOS_SEEN) \ - Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \ - \ - if (RExC_seen & REG_RECURSE_SEEN) \ - Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \ - \ - if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ - Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \ - \ - if (RExC_seen & REG_VERBARG_SEEN) \ - Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \ - \ - if (RExC_seen & REG_CUTGROUP_SEEN) \ - Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \ - \ - if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ - Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \ - \ - if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ - Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \ - \ - if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ - Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \ - \ - Perl_re_printf( aTHX_ "\n"); \ - }); - -#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \ - if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag) - - -#ifdef DEBUGGING -static void -S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str, - const char *close_str) -{ - if (!flags) - return; - - Perl_re_printf( aTHX_ "%s", open_str); - DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL); - DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL); - DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF); - DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR); - DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR); - DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE); - Perl_re_printf( aTHX_ "%s", close_str); -} - - -static void -S_debug_studydata(pTHX_ const char *where, scan_data_t *data, - U32 depth, int is_inf, - SSize_t min, SSize_t stopmin, SSize_t delta) -{ - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - DEBUG_OPTIMISE_MORE_r({ - if (!data) - return; - Perl_re_indentf(aTHX_ "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf, - depth, - where, - min, stopmin, delta, - (IV)data->pos_min, - (IV)data->pos_delta, - (UV)data->flags - ); - - S_debug_show_study_flags(aTHX_ data->flags," [","]"); - - Perl_re_printf( aTHX_ - " Whilem_c: %" IVdf " Lcp: %" IVdf " %s", - (IV)data->whilem_c, - (IV)(data->last_closep ? *((data)->last_closep) : -1), - is_inf ? "INF " : "" - ); - - if (data->last_found) { - int i; - Perl_re_printf(aTHX_ - "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf, - SvPVX_const(data->last_found), - (IV)data->last_end, - (IV)data->last_start_min, - (IV)data->last_start_max - ); - - for (i = 0; i < 2; i++) { - Perl_re_printf(aTHX_ - " %s%s: '%s' @ %" IVdf "/%" IVdf, - data->cur_is_floating == i ? "*" : "", - i ? "Float" : "Fixed", - SvPVX_const(data->substrs[i].str), - (IV)data->substrs[i].min_offset, - (IV)data->substrs[i].max_offset - ); - S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]"); - } - } - - Perl_re_printf( aTHX_ "\n"); - }); -} - - -static void -S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state, - regnode *scan, U32 depth, U32 flags) -{ - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - DEBUG_OPTIMISE_r({ - regnode *Next; - - if (!scan) - return; - Next = regnext(scan); - regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); - Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)", - depth, - str, - REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv), - Next ? (REG_NODE_NUM(Next)) : 0 ); - S_debug_show_study_flags(aTHX_ flags," [ ","]"); - Perl_re_printf( aTHX_ "\n"); - }); -} - - -# define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \ - S_debug_studydata(aTHX_ where, data, depth, is_inf, min, stopmin, delta) - -# define DEBUG_PEEP(str, scan, depth, flags) \ - S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags) - -#else -# define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP -# define DEBUG_PEEP(str, scan, depth, flags) NOOP -#endif - +#include "regcomp_internal.h" /* ========================================================= * BEGIN edit_distance stuff. @@ -1631,5313 +290,16 @@ S_edit_distance(const UV* src, /* END of edit_distance() stuff * ========================================================= */ -/* Mark that we cannot extend a found fixed substring at this point. - Update the longest found anchored substring or the longest found - floating substrings if needed. */ - -STATIC void -S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, - SSize_t *minlenp, int is_inf) -{ - const STRLEN l = CHR_SVLEN(data->last_found); - SV * const longest_sv = data->substrs[data->cur_is_floating].str; - const STRLEN old_l = CHR_SVLEN(longest_sv); - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_SCAN_COMMIT; - - if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { - const U8 i = data->cur_is_floating; - SvSetMagicSV(longest_sv, data->last_found); - data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min; - - if (!i) /* fixed */ - data->substrs[0].max_offset = data->substrs[0].min_offset; - else { /* float */ - data->substrs[1].max_offset = - (is_inf) - ? OPTIMIZE_INFTY - : (l - ? data->last_start_max - : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min - ? OPTIMIZE_INFTY - : data->pos_min + data->pos_delta)); - } - - data->substrs[i].flags &= ~SF_BEFORE_EOL; - data->substrs[i].flags |= data->flags & SF_BEFORE_EOL; - data->substrs[i].minlenp = minlenp; - data->substrs[i].lookbehind = 0; - } - - SvCUR_set(data->last_found, 0); - { - SV * const sv = data->last_found; - if (SvUTF8(sv) && SvMAGICAL(sv)) { - MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); - if (mg) - mg->mg_len = 0; - } - } - data->last_end = -1; - data->flags &= ~SF_BEFORE_EOL; - DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1); -} - -/* An SSC is just a regnode_charclass_posix with an extra field: the inversion - * list that describes which code points it matches */ - -STATIC void -S_ssc_anything(pTHX_ regnode_ssc *ssc) -{ - /* Set the SSC 'ssc' to match an empty string or any code point */ - - PERL_ARGS_ASSERT_SSC_ANYTHING; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - /* mortalize so won't leak */ - ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX)); - ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */ -} - -STATIC int -S_ssc_is_anything(const regnode_ssc *ssc) -{ - /* Returns TRUE if the SSC 'ssc' can match the empty string and any code - * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys - * us anything: if the function returns TRUE, 'ssc' hasn't been restricted - * in any way, so there's no point in using it */ - - UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */ - bool ret; - - PERL_ARGS_ASSERT_SSC_IS_ANYTHING; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) { - return FALSE; - } - - /* See if the list consists solely of the range 0 - Infinity */ - invlist_iterinit(ssc->invlist); - ret = invlist_iternext(ssc->invlist, &start, &end) - && start == 0 - && end == UV_MAX; - - invlist_iterfinish(ssc->invlist); - - if (ret) { - return TRUE; - } - - /* If e.g., both \w and \W are set, matches everything */ - if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { - int i; - for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { - if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { - return TRUE; - } - } - } - - return FALSE; -} - -STATIC void -S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) -{ - /* Initializes the SSC 'ssc'. This includes setting it to match an empty - * string, any code point, or any posix class under locale */ - - PERL_ARGS_ASSERT_SSC_INIT; - - Zero(ssc, 1, regnode_ssc); - set_ANYOF_SYNTHETIC(ssc); - ARG_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE); - ssc_anything(ssc); - - /* If any portion of the regex is to operate under locale rules that aren't - * fully known at compile time, initialization includes it. The reason - * this isn't done for all regexes is that the optimizer was written under - * the assumption that locale was all-or-nothing. Given the complexity and - * lack of documentation in the optimizer, and that there are inadequate - * test cases for locale, many parts of it may not work properly, it is - * safest to avoid locale unless necessary. */ - if (RExC_contains_locale) { - ANYOF_POSIXL_SETALL(ssc); - } - else { - ANYOF_POSIXL_ZERO(ssc); - } -} - -STATIC int -S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, - const regnode_ssc *ssc) -{ - /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only - * to the list of code points matched, and locale posix classes; hence does - * not check its flags) */ - - UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */ - bool ret; - - PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - invlist_iterinit(ssc->invlist); - ret = invlist_iternext(ssc->invlist, &start, &end) - && start == 0 - && end == UV_MAX; - - invlist_iterfinish(ssc->invlist); - - if (! ret) { - return FALSE; - } - - if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { - return FALSE; - } - - return TRUE; -} - -#define INVLIST_INDEX 0 -#define ONLY_LOCALE_MATCHES_INDEX 1 -#define DEFERRED_USER_DEFINED_INDEX 2 - -STATIC SV* -S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, - const regnode_charclass* const node) -{ - /* Returns a mortal inversion list defining which code points are matched - * by 'node', which is of ANYOF-ish type . Handles complementing the - * result if appropriate. If some code points aren't knowable at this - * time, the returned list must, and will, contain every code point that is - * a possibility. */ - - SV* invlist = NULL; - SV* only_utf8_locale_invlist = NULL; - bool new_node_has_latin1 = FALSE; - const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF) - ? ANYOF_FLAGS(node) - : 0; - - PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; - - /* Look at the data structure created by S_set_ANYOF_arg() */ - if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) { - invlist = sv_2mortal(_new_invlist(1)); - invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX); - } - else if (ANYOF_HAS_AUX(node)) { - const U32 n = ARG(node); - SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); - AV * const av = MUTABLE_AV(SvRV(rv)); - SV **const ary = AvARRAY(av); - - if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) { - - /* Here there are things that won't be known until runtime -- we - * have to assume it could be anything */ - invlist = sv_2mortal(_new_invlist(1)); - return _add_range_to_invlist(invlist, 0, UV_MAX); - } - else if (ary[INVLIST_INDEX]) { - - /* Use the node's inversion list */ - invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL)); - } - - /* Get the code points valid only under UTF-8 locales */ - if ( (flags & ANYOFL_FOLD) - && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) - { - only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX]; - } - } - - if (! invlist) { - invlist = sv_2mortal(_new_invlist(0)); - } - - /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS - * code points, and an inversion list for the others, but if there are code - * points that should match only conditionally on the target string being - * UTF-8, those are placed in the inversion list, and not the bitmap. - * Since there are circumstances under which they could match, they are - * included in the SSC. But if the ANYOF node is to be inverted, we have - * to exclude them here, so that when we invert below, the end result - * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We - * have to do this here before we add the unconditionally matched code - * points */ - if (flags & ANYOF_INVERT) { - _invlist_intersection_complement_2nd(invlist, - PL_UpperLatin1, - &invlist); - } - - /* Add in the points from the bit map */ - if (REGNODE_TYPE(OP(node)) == ANYOF){ - for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { - if (ANYOF_BITMAP_TEST(node, i)) { - unsigned int start = i++; - - for (; i < NUM_ANYOF_CODE_POINTS - && ANYOF_BITMAP_TEST(node, i); ++i) - { - /* empty */ - } - invlist = _add_range_to_invlist(invlist, start, i-1); - new_node_has_latin1 = TRUE; - } - } - } - - /* If this can match all upper Latin1 code points, have to add them - * as well. But don't add them if inverting, as when that gets done below, - * it would exclude all these characters, including the ones it shouldn't - * that were added just above */ - if ( ! (flags & ANYOF_INVERT) - && OP(node) == ANYOFD - && (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)) - { - _invlist_union(invlist, PL_UpperLatin1, &invlist); - } - - /* Similarly for these */ - if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) { - _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist); - } - - if (flags & ANYOF_INVERT) { - _invlist_invert(invlist); - } - else if (flags & ANYOFL_FOLD) { - if (new_node_has_latin1) { - - /* These folds are potential in Turkic locales */ - if (_invlist_contains_cp(invlist, 'i')) { - invlist = add_cp_to_invlist(invlist, - LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); - } - if (_invlist_contains_cp(invlist, 'I')) { - invlist = add_cp_to_invlist(invlist, - LATIN_SMALL_LETTER_DOTLESS_I); - } - - /* Under /li, any 0-255 could fold to any other 0-255, depending on - * the locale. We can skip this if there are no 0-255 at all. */ - _invlist_union(invlist, PL_Latin1, &invlist); - } - else { - if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) { - invlist = add_cp_to_invlist(invlist, 'I'); - } - if (_invlist_contains_cp(invlist, - LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)) - { - invlist = add_cp_to_invlist(invlist, 'i'); - } - } - } - - /* Similarly add the UTF-8 locale possible matches. These have to be - * deferred until after the non-UTF-8 locale ones are taken care of just - * above, or it leads to wrong results under ANYOF_INVERT */ - if (only_utf8_locale_invlist) { - _invlist_union_maybe_complement_2nd(invlist, - only_utf8_locale_invlist, - flags & ANYOF_INVERT, - &invlist); - } - - return invlist; -} - -/* These two functions currently do the exact same thing */ -#define ssc_init_zero ssc_init - -#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) -#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) - -/* 'AND' a given class with another one. Can create false positives. 'ssc' - * should not be inverted. */ - -STATIC void -S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, - const regnode_charclass *and_with) -{ - /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either - * another SSC or a regular ANYOF class. Can create false positives. */ - - SV* anded_cp_list; - U8 and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF) - ? ANYOF_FLAGS(and_with) - : 0; - U8 anded_flags; - - PERL_ARGS_ASSERT_SSC_AND; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract - * the code point inversion list and just the relevant flags */ - if (is_ANYOF_SYNTHETIC(and_with)) { - anded_cp_list = ((regnode_ssc *)and_with)->invlist; - anded_flags = and_with_flags; - - /* XXX This is a kludge around what appears to be deficiencies in the - * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, - * there are paths through the optimizer where it doesn't get weeded - * out when it should. And if we don't make some extra provision for - * it like the code just below, it doesn't get added when it should. - * This solution is to add it only when AND'ing, which is here, and - * only when what is being AND'ed is the pristine, original node - * matching anything. Thus it is like adding it to ssc_anything() but - * only when the result is to be AND'ed. Probably the same solution - * could be adopted for the same problem we have with /l matching, - * which is solved differently in S_ssc_init(), and that would lead to - * fewer false positives than that solution has. But if this solution - * creates bugs, the consequences are only that a warning isn't raised - * that should be; while the consequences for having /l bugs is - * incorrect matches */ - if (ssc_is_anything((regnode_ssc *)and_with)) { - anded_flags |= ANYOF_WARN_SUPER__shared; - } - } - else { - anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); - if (OP(and_with) == ANYOFD) { - anded_flags = and_with_flags & ANYOF_COMMON_FLAGS; - } - else { - anded_flags = and_with_flags - & ( ANYOF_COMMON_FLAGS - |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared - |ANYOF_HAS_EXTRA_RUNTIME_MATCHES); - if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) { - anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES; - } - } - } - - ANYOF_FLAGS(ssc) &= anded_flags; - - /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. - * C2 is the list of code points in 'and-with'; P2, its posix classes. - * 'and_with' may be inverted. When not inverted, we have the situation of - * computing: - * (C1 | P1) & (C2 | P2) - * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) - * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) - * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) - * <= ((C1 & C2) | P1 | P2) - * Alternatively, the last few steps could be: - * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) - * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) - * <= (C1 | C2 | (P1 & P2)) - * We favor the second approach if either P1 or P2 is non-empty. This is - * because these components are a barrier to doing optimizations, as what - * they match cannot be known until the moment of matching as they are - * dependent on the current locale, 'AND"ing them likely will reduce or - * eliminate them. - * But we can do better if we know that C1,P1 are in their initial state (a - * frequent occurrence), each matching everything: - * (<everything>) & (C2 | P2) = C2 | P2 - * Similarly, if C2,P2 are in their initial state (again a frequent - * occurrence), the result is a no-op - * (C1 | P1) & (<everything>) = C1 | P1 - * - * Inverted, we have - * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) - * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) - * <= (C1 & ~C2) | (P1 & ~P2) - * */ - - if ((and_with_flags & ANYOF_INVERT) - && ! is_ANYOF_SYNTHETIC(and_with)) - { - unsigned int i; - - ssc_intersection(ssc, - anded_cp_list, - FALSE /* Has already been inverted */ - ); - - /* If either P1 or P2 is empty, the intersection will be also; can skip - * the loop */ - if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) { - ANYOF_POSIXL_ZERO(ssc); - } - else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { - - /* Note that the Posix class component P from 'and_with' actually - * looks like: - * P = Pa | Pb | ... | Pn - * where each component is one posix class, such as in [\w\s]. - * Thus - * ~P = ~(Pa | Pb | ... | Pn) - * = ~Pa & ~Pb & ... & ~Pn - * <= ~Pa | ~Pb | ... | ~Pn - * The last is something we can easily calculate, but unfortunately - * is likely to have many false positives. We could do better - * in some (but certainly not all) instances if two classes in - * P have known relationships. For example - * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: - * So - * :lower: & :print: = :lower: - * And similarly for classes that must be disjoint. For example, - * since \s and \w can have no elements in common based on rules in - * the POSIX standard, - * \w & ^\S = nothing - * Unfortunately, some vendor locales do not meet the Posix - * standard, in particular almost everything by Microsoft. - * The loop below just changes e.g., \w into \W and vice versa */ - - regnode_charclass_posixl temp; - int add = 1; /* To calculate the index of the complement */ - - Zero(&temp, 1, regnode_charclass_posixl); - ANYOF_POSIXL_ZERO(&temp); - for (i = 0; i < ANYOF_MAX; i++) { - assert(i % 2 != 0 - || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) - || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); - - if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { - ANYOF_POSIXL_SET(&temp, i + add); - } - add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ - } - ANYOF_POSIXL_AND(&temp, ssc); - - } /* else ssc already has no posixes */ - } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC - in its initial state */ - else if (! is_ANYOF_SYNTHETIC(and_with) - || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) - { - /* But if 'ssc' is in its initial state, the result is just 'and_with'; - * copy it over 'ssc' */ - if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { - if (is_ANYOF_SYNTHETIC(and_with)) { - StructCopy(and_with, ssc, regnode_ssc); - } - else { - ssc->invlist = anded_cp_list; - ANYOF_POSIXL_ZERO(ssc); - if (and_with_flags & ANYOF_MATCHES_POSIXL) { - ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); - } - } - } - else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) - || (and_with_flags & ANYOF_MATCHES_POSIXL)) - { - /* One or the other of P1, P2 is non-empty. */ - if (and_with_flags & ANYOF_MATCHES_POSIXL) { - ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); - } - ssc_union(ssc, anded_cp_list, FALSE); - } - else { /* P1 = P2 = empty */ - ssc_intersection(ssc, anded_cp_list, FALSE); - } - } -} - -STATIC void -S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, - const regnode_charclass *or_with) -{ - /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either - * another SSC or a regular ANYOF class. Can create false positives if - * 'or_with' is to be inverted. */ - - SV* ored_cp_list; - U8 ored_flags; - U8 or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF) - ? ANYOF_FLAGS(or_with) - : 0; - - PERL_ARGS_ASSERT_SSC_OR; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract - * the code point inversion list and just the relevant flags */ - if (is_ANYOF_SYNTHETIC(or_with)) { - ored_cp_list = ((regnode_ssc*) or_with)->invlist; - ored_flags = or_with_flags; - } - else { - ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); - ored_flags = or_with_flags & ANYOF_COMMON_FLAGS; - if (OP(or_with) != ANYOFD) { - ored_flags |= - or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared - |ANYOF_HAS_EXTRA_RUNTIME_MATCHES); - if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) { - ored_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES; - } - } - } - - ANYOF_FLAGS(ssc) |= ored_flags; - - /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. - * C2 is the list of code points in 'or-with'; P2, its posix classes. - * 'or_with' may be inverted. When not inverted, we have the simple - * situation of computing: - * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) - * If P1|P2 yields a situation with both a class and its complement are - * set, like having both \w and \W, this matches all code points, and we - * can delete these from the P component of the ssc going forward. XXX We - * might be able to delete all the P components, but I (khw) am not certain - * about this, and it is better to be safe. - * - * Inverted, we have - * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) - * <= (C1 | P1) | ~C2 - * <= (C1 | ~C2) | P1 - * (which results in actually simpler code than the non-inverted case) - * */ - - if ((or_with_flags & ANYOF_INVERT) - && ! is_ANYOF_SYNTHETIC(or_with)) - { - /* We ignore P2, leaving P1 going forward */ - } /* else Not inverted */ - else if (or_with_flags & ANYOF_MATCHES_POSIXL) { - ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); - if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { - unsigned int i; - for (i = 0; i < ANYOF_MAX; i += 2) { - if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) - { - ssc_match_all_cp(ssc); - ANYOF_POSIXL_CLEAR(ssc, i); - ANYOF_POSIXL_CLEAR(ssc, i+1); - } - } - } - } - - ssc_union(ssc, - ored_cp_list, - FALSE /* Already has been inverted */ - ); -} - -STATIC void -S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) -{ - PERL_ARGS_ASSERT_SSC_UNION; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - _invlist_union_maybe_complement_2nd(ssc->invlist, - invlist, - invert2nd, - &ssc->invlist); -} - -STATIC void -S_ssc_intersection(pTHX_ regnode_ssc *ssc, - SV* const invlist, - const bool invert2nd) -{ - PERL_ARGS_ASSERT_SSC_INTERSECTION; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - _invlist_intersection_maybe_complement_2nd(ssc->invlist, - invlist, - invert2nd, - &ssc->invlist); -} - -STATIC void -S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) -{ - PERL_ARGS_ASSERT_SSC_ADD_RANGE; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); -} - -STATIC void -S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) -{ - /* AND just the single code point 'cp' into the SSC 'ssc' */ - - SV* cp_list = _new_invlist(2); - - PERL_ARGS_ASSERT_SSC_CP_AND; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - cp_list = add_cp_to_invlist(cp_list, cp); - ssc_intersection(ssc, cp_list, - FALSE /* Not inverted */ - ); - SvREFCNT_dec_NN(cp_list); -} - -STATIC void -S_ssc_clear_locale(regnode_ssc *ssc) -{ - /* Set the SSC 'ssc' to not match any locale things */ - PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - ANYOF_POSIXL_ZERO(ssc); - ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; -} - -STATIC bool -S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) -{ - /* The synthetic start class is used to hopefully quickly winnow down - * places where a pattern could start a match in the target string. If it - * doesn't really narrow things down that much, there isn't much point to - * having the overhead of using it. This function uses some very crude - * heuristics to decide if to use the ssc or not. - * - * It returns TRUE if 'ssc' rules out more than half what it considers to - * be the "likely" possible matches, but of course it doesn't know what the - * actual things being matched are going to be; these are only guesses - * - * For /l matches, it assumes that the only likely matches are going to be - * in the 0-255 range, uniformly distributed, so half of that is 127 - * For /a and /d matches, it assumes that the likely matches will be just - * the ASCII range, so half of that is 63 - * For /u and there isn't anything matching above the Latin1 range, it - * assumes that that is the only range likely to be matched, and uses - * half that as the cut-off: 127. If anything matches above Latin1, - * it assumes that all of Unicode could match (uniformly), except for - * non-Unicode code points and things in the General Category "Other" - * (unassigned, private use, surrogates, controls and formats). This - * is a much large number. */ - - U32 count = 0; /* Running total of number of code points matched by - 'ssc' */ - UV start, end; /* Start and end points of current range in inversion - XXX outdated. UTF-8 locales are common, what about invert? list */ - const U32 max_code_points = (LOC) - ? 256 - : (( ! UNI_SEMANTICS - || invlist_highest(ssc->invlist) < 256) - ? 128 - : NON_OTHER_COUNT); - const U32 max_match = max_code_points / 2; - - PERL_ARGS_ASSERT_IS_SSC_WORTH_IT; - - invlist_iterinit(ssc->invlist); - while (invlist_iternext(ssc->invlist, &start, &end)) { - if (start >= max_code_points) { - break; - } - end = MIN(end, max_code_points - 1); - count += end - start + 1; - if (count >= max_match) { - invlist_iterfinish(ssc->invlist); - return FALSE; - } - } - - return TRUE; -} - - -STATIC void -S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) -{ - /* The inversion list in the SSC is marked mortal; now we need a more - * permanent copy, which is stored the same way that is done in a regular - * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit - * map */ - - SV* invlist = invlist_clone(ssc->invlist, NULL); - - PERL_ARGS_ASSERT_SSC_FINALIZE; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - /* The code in this file assumes that all but these flags aren't relevant - * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared - * by the time we reach here */ - assert(! (ANYOF_FLAGS(ssc) - & ~( ANYOF_COMMON_FLAGS - |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared - |ANYOF_HAS_EXTRA_RUNTIME_MATCHES))); - - populate_anyof_bitmap_from_invlist( (regnode *) ssc, &invlist); - - set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL); - SvREFCNT_dec(invlist); - - /* Make sure is clone-safe */ - ssc->invlist = NULL; - - if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { - ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL; - OP(ssc) = ANYOFPOSIXL; - } - else if (RExC_contains_locale) { - OP(ssc) = ANYOFL; - } - - assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); -} - -#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] -#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) -#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) -#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ - ? (TRIE_LIST_CUR( idx ) - 1) \ - : 0 ) - - -#ifdef DEBUGGING -/* - dump_trie(trie,widecharmap,revcharmap) - dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) - dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) - - These routines dump out a trie in a somewhat readable format. - The _interim_ variants are used for debugging the interim - tables that are used to generate the final compressed - representation which is what dump_trie expects. - - Part of the reason for their existence is to provide a form - of documentation as to how the different representations function. - -*/ - -/* - Dumps the final compressed table form of the trie to Perl_debug_log. - Used for debugging make_trie(). -*/ - -STATIC void -S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, - AV *revcharmap, U32 depth) -{ - U32 state; - SV *sv=sv_newmortal(); - int colwidth= widecharmap ? 6 : 4; - U16 word; - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_DUMP_TRIE; - - Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ", - depth+1, "Match","Base","Ofs" ); - - for( state = 0 ; state < trie->uniquecharcount ; state++ ) { - SV ** const tmp = av_fetch_simple( revcharmap, state, 0); - if ( tmp ) { - Perl_re_printf( aTHX_ "%*s", - colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, - PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR - ) - ); - } - } - Perl_re_printf( aTHX_ "\n"); - Perl_re_indentf( aTHX_ "State|-----------------------", depth+1); - - for( state = 0 ; state < trie->uniquecharcount ; state++ ) - Perl_re_printf( aTHX_ "%.*s", colwidth, "--------"); - Perl_re_printf( aTHX_ "\n"); - - for( state = 1 ; state < trie->statecount ; state++ ) { - const U32 base = trie->states[ state ].trans.base; - - Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state); - - if ( trie->states[ state ].wordnum ) { - Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum ); - } else { - Perl_re_printf( aTHX_ "%6s", "" ); - } - - Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base ); - - if ( base ) { - U32 ofs = 0; - - while( ( base + ofs < trie->uniquecharcount ) || - ( base + ofs - trie->uniquecharcount < trie->lasttrans - && trie->trans[ base + ofs - trie->uniquecharcount ].check - != state)) - ofs++; - - Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs); - - for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { - if ( ( base + ofs >= trie->uniquecharcount ) - && ( base + ofs - trie->uniquecharcount - < trie->lasttrans ) - && trie->trans[ base + ofs - - trie->uniquecharcount ].check == state ) - { - Perl_re_printf( aTHX_ "%*" UVXf, colwidth, - (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next - ); - } else { - Perl_re_printf( aTHX_ "%*s", colwidth," ." ); - } - } - - Perl_re_printf( aTHX_ "]"); - - } - Perl_re_printf( aTHX_ "\n" ); - } - Perl_re_indentf( aTHX_ "word_info N:(prev,len)=", - depth); - for (word=1; word <= trie->wordcount; word++) { - Perl_re_printf( aTHX_ " %d:(%d,%d)", - (int)word, (int)(trie->wordinfo[word].prev), - (int)(trie->wordinfo[word].len)); - } - Perl_re_printf( aTHX_ "\n" ); -} -/* - Dumps a fully constructed but uncompressed trie in list form. - List tries normally only are used for construction when the number of - possible chars (trie->uniquecharcount) is very high. - Used for debugging make_trie(). -*/ -STATIC void -S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, - HV *widecharmap, AV *revcharmap, U32 next_alloc, - U32 depth) -{ - U32 state; - SV *sv=sv_newmortal(); - int colwidth= widecharmap ? 6 : 4; - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; - - /* print out the table precompression. */ - Perl_re_indentf( aTHX_ "State :Word | Transition Data\n", - depth+1 ); - Perl_re_indentf( aTHX_ "%s", - depth+1, "------:-----+-----------------\n" ); - - for( state=1 ; state < next_alloc ; state ++ ) { - U16 charid; - - Perl_re_indentf( aTHX_ " %4" UVXf " :", - depth+1, (UV)state ); - if ( ! trie->states[ state ].wordnum ) { - Perl_re_printf( aTHX_ "%5s| ",""); - } else { - Perl_re_printf( aTHX_ "W%4x| ", - trie->states[ state ].wordnum - ); - } - for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { - SV ** const tmp = av_fetch_simple( revcharmap, - TRIE_LIST_ITEM(state, charid).forid, 0); - if ( tmp ) { - Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ", - colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), - colwidth, - PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) - | PERL_PV_ESCAPE_FIRSTCHAR - ) , - TRIE_LIST_ITEM(state, charid).forid, - (UV)TRIE_LIST_ITEM(state, charid).newstate - ); - if (!(charid % 10)) - Perl_re_printf( aTHX_ "\n%*s| ", - (int)((depth * 2) + 14), ""); - } - } - Perl_re_printf( aTHX_ "\n"); - } -} - -/* - Dumps a fully constructed but uncompressed trie in table form. - This is the normal DFA style state transition table, with a few - twists to facilitate compression later. - Used for debugging make_trie(). -*/ -STATIC void -S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, - HV *widecharmap, AV *revcharmap, U32 next_alloc, - U32 depth) -{ - U32 state; - U16 charid; - SV *sv=sv_newmortal(); - int colwidth= widecharmap ? 6 : 4; - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; - - /* - print out the table precompression so that we can do a visual check - that they are identical. - */ - - Perl_re_indentf( aTHX_ "Char : ", depth+1 ); - - for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { - SV ** const tmp = av_fetch_simple( revcharmap, charid, 0); - if ( tmp ) { - Perl_re_printf( aTHX_ "%*s", - colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, - PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR - ) - ); - } - } - - Perl_re_printf( aTHX_ "\n"); - Perl_re_indentf( aTHX_ "State+-", depth+1 ); - - for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { - Perl_re_printf( aTHX_ "%.*s", colwidth,"--------"); - } - - Perl_re_printf( aTHX_ "\n" ); - - for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { - - Perl_re_indentf( aTHX_ "%4" UVXf " : ", - depth+1, - (UV)TRIE_NODENUM( state ) ); - - for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { - UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); - if (v) - Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v ); - else - Perl_re_printf( aTHX_ "%*s", colwidth, "." ); - } - if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { - Perl_re_printf( aTHX_ " (%4" UVXf ")\n", - (UV)trie->trans[ state ].check ); - } else { - Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n", - (UV)trie->trans[ state ].check, - trie->states[ TRIE_NODENUM( state ) ].wordnum ); - } - } -} - -#endif - - -/* make_trie(startbranch,first,last,tail,word_count,flags,depth) - startbranch: the first branch in the whole branch sequence - first : start branch of sequence of branch-exact nodes. - May be the same as startbranch - last : Thing following the last branch. - May be the same as tail. - tail : item following the branch sequence - count : words in the sequence - flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/ - depth : indent depth - -Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. - -A trie is an N'ary tree where the branches are determined by digital -decomposition of the key. IE, at the root node you look up the 1st character and -follow that branch repeat until you find the end of the branches. Nodes can be -marked as "accepting" meaning they represent a complete word. Eg: - - /he|she|his|hers/ - -would convert into the following structure. Numbers represent states, letters -following numbers represent valid transitions on the letter from that state, if -the number is in square brackets it represents an accepting state, otherwise it -will be in parenthesis. - - +-h->+-e->[3]-+-r->(8)-+-s->[9] - | | - | (2) - | | - (1) +-i->(6)-+-s->[7] - | - +-s->(3)-+-h->(4)-+-e->[5] - - Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) - -This shows that when matching against the string 'hers' we will begin at state 1 -read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, -then read 'r' and go to state 8 followed by 's' which takes us to state 9 which -is also accepting. Thus we know that we can match both 'he' and 'hers' with a -single traverse. We store a mapping from accepting to state to which word was -matched, and then when we have multiple possibilities we try to complete the -rest of the regex in the order in which they occurred in the alternation. - -The only prior NFA like behaviour that would be changed by the TRIE support is -the silent ignoring of duplicate alternations which are of the form: - - / (DUPE|DUPE) X? (?{ ... }) Y /x - -Thus EVAL blocks following a trie may be called a different number of times with -and without the optimisation. With the optimisations dupes will be silently -ignored. This inconsistent behaviour of EVAL type nodes is well established as -the following demonstrates: - - 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ - -which prints out 'word' three times, but - - 'words'=~/(word|word|word)(?{ print $1 })S/ - -which doesnt print it out at all. This is due to other optimisations kicking in. - -Example of what happens on a structural level: - -The regexp /(ac|ad|ab)+/ will produce the following debug output: - - 1: CURLYM[1] {1,32767}(18) - 5: BRANCH(8) - 6: EXACT <ac>(16) - 8: BRANCH(11) - 9: EXACT <ad>(16) - 11: BRANCH(14) - 12: EXACT <ab>(16) - 16: SUCCEED(0) - 17: NOTHING(18) - 18: END(0) - -This would be optimizable with startbranch=5, first=5, last=16, tail=16 -and should turn into: - - 1: CURLYM[1] {1,32767}(18) - 5: TRIE(16) - [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] - <ac> - <ad> - <ab> - 16: SUCCEED(0) - 17: NOTHING(18) - 18: END(0) - -Cases where tail != last would be like /(?foo|bar)baz/: - - 1: BRANCH(4) - 2: EXACT <foo>(8) - 4: BRANCH(7) - 5: EXACT <bar>(8) - 7: TAIL(8) - 8: EXACT <baz>(10) - 10: END(0) - -which would be optimizable with startbranch=1, first=1, last=7, tail=8 -and would end up looking like: - - 1: TRIE(8) - [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] - <foo> - <bar> - 7: TAIL(8) - 8: EXACT <baz>(10) - 10: END(0) - - d = uvchr_to_utf8_flags(d, uv, 0); - -is the recommended Unicode-aware way of saying - - *(d++) = uv; -*/ - -#define TRIE_STORE_REVCHAR(val) \ - STMT_START { \ - if (UTF) { \ - SV *zlopp = newSV(UTF8_MAXBYTES); \ - unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ - unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ - *kapow = '\0'; \ - SvCUR_set(zlopp, kapow - flrbbbbb); \ - SvPOK_on(zlopp); \ - SvUTF8_on(zlopp); \ - av_push_simple(revcharmap, zlopp); \ - } else { \ - char ooooff = (char)val; \ - av_push_simple(revcharmap, newSVpvn(&ooooff, 1)); \ - } \ - } STMT_END - -/* This gets the next character from the input, folding it if not already - * folded. */ -#define TRIE_READ_CHAR STMT_START { \ - wordlen++; \ - if ( UTF ) { \ - /* if it is UTF then it is either already folded, or does not need \ - * folding */ \ - uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ - } \ - else if (folder == PL_fold_latin1) { \ - /* This folder implies Unicode rules, which in the range expressible \ - * by not UTF is the lower case, with the two exceptions, one of \ - * which should have been taken care of before calling this */ \ - assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ - uvc = toLOWER_L1(*uc); \ - if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ - len = 1; \ - } else { \ - /* raw data, will be folded later if needed */ \ - uvc = (U32)*uc; \ - len = 1; \ - } \ -} STMT_END - - - -#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ - if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ - U32 ging = TRIE_LIST_LEN( state ) * 2; \ - Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ - TRIE_LIST_LEN( state ) = ging; \ - } \ - TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ - TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ - TRIE_LIST_CUR( state )++; \ -} STMT_END - -#define TRIE_LIST_NEW(state) STMT_START { \ - Newx( trie->states[ state ].trans.list, \ - 4, reg_trie_trans_le ); \ - TRIE_LIST_CUR( state ) = 1; \ - TRIE_LIST_LEN( state ) = 4; \ -} STMT_END - -#define TRIE_HANDLE_WORD(state) STMT_START { \ - U16 dupe= trie->states[ state ].wordnum; \ - regnode * const noper_next = regnext( noper ); \ - \ - DEBUG_r({ \ - /* store the word for dumping */ \ - SV* tmp; \ - if (OP(noper) != NOTHING) \ - tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ - else \ - tmp = newSVpvn_utf8( "", 0, UTF ); \ - av_push_simple( trie_words, tmp ); \ - }); \ - \ - curword++; \ - trie->wordinfo[curword].prev = 0; \ - trie->wordinfo[curword].len = wordlen; \ - trie->wordinfo[curword].accept = state; \ - \ - if ( noper_next < tail ) { \ - if (!trie->jump) \ - trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ - sizeof(U16) ); \ - trie->jump[curword] = (U16)(noper_next - convert); \ - if (!jumper) \ - jumper = noper_next; \ - if (!nextbranch) \ - nextbranch= regnext(cur); \ - } \ - \ - if ( dupe ) { \ - /* It's a dupe. Pre-insert into the wordinfo[].prev */\ - /* chain, so that when the bits of chain are later */\ - /* linked together, the dups appear in the chain */\ - trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \ - trie->wordinfo[dupe].prev = curword; \ - } else { \ - /* we haven't inserted this word yet. */ \ - trie->states[ state ].wordnum = curword; \ - } \ -} STMT_END - - -#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ - ( ( base + charid >= ucharcount \ - && base + charid < ubound \ - && state == trie->trans[ base - ucharcount + charid ].check \ - && trie->trans[ base - ucharcount + charid ].next ) \ - ? trie->trans[ base - ucharcount + charid ].next \ - : ( state==1 ? special : 0 ) \ - ) - -#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \ -STMT_START { \ - TRIE_BITMAP_SET(trie, uvc); \ - /* store the folded codepoint */ \ - if ( folder ) \ - TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \ - \ - if ( !UTF ) { \ - /* store first byte of utf8 representation of */ \ - /* variant codepoints */ \ - if (! UVCHR_IS_INVARIANT(uvc)) { \ - TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \ - } \ - } \ -} STMT_END -#define MADE_TRIE 1 -#define MADE_JUMP_TRIE 2 -#define MADE_EXACT_TRIE 4 - -STATIC I32 -S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, - regnode *first, regnode *last, regnode *tail, - U32 word_count, U32 flags, U32 depth) -{ - /* first pass, loop through and scan words */ - reg_trie_data *trie; - HV *widecharmap = NULL; - AV *revcharmap = newAV(); - regnode *cur; - STRLEN len = 0; - UV uvc = 0; - U16 curword = 0; - U32 next_alloc = 0; - regnode *jumper = NULL; - regnode *nextbranch = NULL; - regnode *convert = NULL; - U32 *prev_states; /* temp array mapping each state to previous one */ - /* we just use folder as a flag in utf8 */ - const U8 * folder = NULL; - - /* in the below add_data call we are storing either 'tu' or 'tuaa' - * which stands for one trie structure, one hash, optionally followed - * by two arrays */ -#ifdef DEBUGGING - const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa")); - AV *trie_words = NULL; - /* along with revcharmap, this only used during construction but both are - * useful during debugging so we store them in the struct when debugging. - */ -#else - const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); - STRLEN trie_charcount=0; -#endif - SV *re_trie_maxbuff; - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_MAKE_TRIE; -#ifndef DEBUGGING - PERL_UNUSED_ARG(depth); -#endif - - switch (flags) { - case EXACT: case EXACT_REQ8: case EXACTL: break; - case EXACTFAA: - case EXACTFUP: - case EXACTFU: - case EXACTFLU8: folder = PL_fold_latin1; break; - case EXACTF: folder = PL_fold; break; - default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, REGNODE_NAME(flags) ); - } - - trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); - trie->refcount = 1; - trie->startstate = 1; - trie->wordcount = word_count; - RExC_rxi->data->data[ data_slot ] = (void*)trie; - trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); - if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL) - trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); - trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( - trie->wordcount+1, sizeof(reg_trie_wordinfo)); - - DEBUG_r({ - trie_words = newAV(); - }); - - re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD); - assert(re_trie_maxbuff); - if (!SvIOK(re_trie_maxbuff)) { - sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); - } - DEBUG_TRIE_COMPILE_r({ - Perl_re_indentf( aTHX_ - "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", - depth+1, - REG_NODE_NUM(startbranch), REG_NODE_NUM(first), - REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); - }); - - /* Find the node we are going to overwrite */ - if ( first == startbranch && OP( last ) != BRANCH ) { - /* whole branch chain */ - convert = first; - } else { - /* branch sub-chain */ - convert = REGNODE_AFTER( first ); - } - - /* -- First loop and Setup -- - - We first traverse the branches and scan each word to determine if it - contains widechars, and how many unique chars there are, this is - important as we have to build a table with at least as many columns as we - have unique chars. - - We use an array of integers to represent the character codes 0..255 - (trie->charmap) and we use a an HV* to store Unicode characters. We use - the native representation of the character value as the key and IV's for - the coded index. - - *TODO* If we keep track of how many times each character is used we can - remap the columns so that the table compression later on is more - efficient in terms of memory by ensuring the most common value is in the - middle and the least common are on the outside. IMO this would be better - than a most to least common mapping as theres a decent chance the most - common letter will share a node with the least common, meaning the node - will not be compressible. With a middle is most common approach the worst - case is when we have the least common nodes twice. - - */ - - for ( cur = first ; cur < last ; cur = regnext( cur ) ) { - regnode *noper = REGNODE_AFTER( cur ); - const U8 *uc; - const U8 *e; - int foldlen = 0; - U32 wordlen = 0; /* required init */ - STRLEN minchars = 0; - STRLEN maxchars = 0; - bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the - bitmap?*/ - - if (OP(noper) == NOTHING) { - /* skip past a NOTHING at the start of an alternation - * eg, /(?:)a|(?:b)/ should be the same as /a|b/ - * - * If the next node is not something we are supposed to process - * we will just ignore it due to the condition guarding the - * next block. - */ - - regnode *noper_next= regnext(noper); - if (noper_next < tail) - noper= noper_next; - } - - if ( noper < tail - && ( OP(noper) == flags - || (flags == EXACT && OP(noper) == EXACT_REQ8) - || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 - || OP(noper) == EXACTFUP)))) - { - uc= (U8*)STRING(noper); - e= uc + STR_LEN(noper); - } else { - trie->minlen= 0; - continue; - } - - - if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ - TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte - regardless of encoding */ - if (OP( noper ) == EXACTFUP) { - /* false positives are ok, so just set this */ - TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); - } - } - - for ( ; uc < e ; uc += len ) { /* Look at each char in the current - branch */ - TRIE_CHARCOUNT(trie)++; - TRIE_READ_CHAR; - - /* TRIE_READ_CHAR returns the current character, or its fold if /i - * is in effect. Under /i, this character can match itself, or - * anything that folds to it. If not under /i, it can match just - * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN - * all fold to k, and all are single characters. But some folds - * expand to more than one character, so for example LATIN SMALL - * LIGATURE FFI folds to the three character sequence 'ffi'. If - * the string beginning at 'uc' is 'ffi', it could be matched by - * three characters, or just by the one ligature character. (It - * could also be matched by two characters: LATIN SMALL LIGATURE FF - * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). - * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also - * match.) The trie needs to know the minimum and maximum number - * of characters that could match so that it can use size alone to - * quickly reject many match attempts. The max is simple: it is - * the number of folded characters in this branch (since a fold is - * never shorter than what folds to it. */ - - maxchars++; - - /* And the min is equal to the max if not under /i (indicated by - * 'folder' being NULL), or there are no multi-character folds. If - * there is a multi-character fold, the min is incremented just - * once, for the character that folds to the sequence. Each - * character in the sequence needs to be added to the list below of - * characters in the trie, but we count only the first towards the - * min number of characters needed. This is done through the - * variable 'foldlen', which is returned by the macros that look - * for these sequences as the number of bytes the sequence - * occupies. Each time through the loop, we decrement 'foldlen' by - * how many bytes the current char occupies. Only when it reaches - * 0 do we increment 'minchars' or look for another multi-character - * sequence. */ - if (folder == NULL) { - minchars++; - } - else if (foldlen > 0) { - foldlen -= (UTF) ? UTF8SKIP(uc) : 1; - } - else { - minchars++; - - /* See if *uc is the beginning of a multi-character fold. If - * so, we decrement the length remaining to look at, to account - * for the current character this iteration. (We can use 'uc' - * instead of the fold returned by TRIE_READ_CHAR because the - * macro is smart enough to account for any unfolded - * characters. */ - if (UTF) { - if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { - foldlen -= UTF8SKIP(uc); - } - } - else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { - foldlen--; - } - } - - /* The current character (and any potential folds) should be added - * to the possible matching characters for this position in this - * branch */ - if ( uvc < 256 ) { - if ( folder ) { - U8 folded= folder[ (U8) uvc ]; - if ( !trie->charmap[ folded ] ) { - trie->charmap[ folded ]=( ++trie->uniquecharcount ); - TRIE_STORE_REVCHAR( folded ); - } - } - if ( !trie->charmap[ uvc ] ) { - trie->charmap[ uvc ]=( ++trie->uniquecharcount ); - TRIE_STORE_REVCHAR( uvc ); - } - if ( set_bit ) { - /* store the codepoint in the bitmap, and its folded - * equivalent. */ - TRIE_BITMAP_SET_FOLDED(trie, uvc, folder); - set_bit = 0; /* We've done our bit :-) */ - } - } else { - - /* XXX We could come up with the list of code points that fold - * to this using PL_utf8_foldclosures, except not for - * multi-char folds, as there may be multiple combinations - * there that could work, which needs to wait until runtime to - * resolve (The comment about LIGATURE FFI above is such an - * example */ - - SV** svpp; - if ( !widecharmap ) - widecharmap = newHV(); - - svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); - - if ( !svpp ) - Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc ); - - if ( !SvTRUE( *svpp ) ) { - sv_setiv( *svpp, ++trie->uniquecharcount ); - TRIE_STORE_REVCHAR(uvc); - } - } - } /* end loop through characters in this branch of the trie */ - - /* We take the min and max for this branch and combine to find the min - * and max for all branches processed so far */ - if( cur == first ) { - trie->minlen = minchars; - trie->maxlen = maxchars; - } else if (minchars < trie->minlen) { - trie->minlen = minchars; - } else if (maxchars > trie->maxlen) { - trie->maxlen = maxchars; - } - } /* end first pass */ - DEBUG_TRIE_COMPILE_r( - Perl_re_indentf( aTHX_ - "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", - depth+1, - ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, - (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, - (int)trie->minlen, (int)trie->maxlen ) - ); - - /* - We now know what we are dealing with in terms of unique chars and - string sizes so we can calculate how much memory a naive - representation using a flat table will take. If it's over a reasonable - limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory - conservative but potentially much slower representation using an array - of lists. - - At the end we convert both representations into the same compressed - form that will be used in regexec.c for matching with. The latter - is a form that cannot be used to construct with but has memory - properties similar to the list form and access properties similar - to the table form making it both suitable for fast searches and - small enough that its feasable to store for the duration of a program. - - See the comment in the code where the compressed table is produced - inplace from the flat tabe representation for an explanation of how - the compression works. - - */ - - - Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); - prev_states[1] = 0; - - if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) - > SvIV(re_trie_maxbuff) ) - { - /* - Second Pass -- Array Of Lists Representation - - Each state will be represented by a list of charid:state records - (reg_trie_trans_le) the first such element holds the CUR and LEN - points of the allocated array. (See defines above). - - We build the initial structure using the lists, and then convert - it into the compressed table form which allows faster lookups - (but cant be modified once converted). - */ - - STRLEN transcount = 1; - - DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n", - depth+1)); - - trie->states = (reg_trie_state *) - PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, - sizeof(reg_trie_state) ); - TRIE_LIST_NEW(1); - next_alloc = 2; - - for ( cur = first ; cur < last ; cur = regnext( cur ) ) { - - regnode *noper = REGNODE_AFTER( cur ); - U32 state = 1; /* required init */ - U16 charid = 0; /* sanity init */ - U32 wordlen = 0; /* required init */ - - if (OP(noper) == NOTHING) { - regnode *noper_next= regnext(noper); - if (noper_next < tail) - noper= noper_next; - /* we will undo this assignment if noper does not - * point at a trieable type in the else clause of - * the following statement. */ - } - - if ( noper < tail - && ( OP(noper) == flags - || (flags == EXACT && OP(noper) == EXACT_REQ8) - || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 - || OP(noper) == EXACTFUP)))) - { - const U8 *uc= (U8*)STRING(noper); - const U8 *e= uc + STR_LEN(noper); - - for ( ; uc < e ; uc += len ) { - - TRIE_READ_CHAR; - - if ( uvc < 256 ) { - charid = trie->charmap[ uvc ]; - } else { - SV** const svpp = hv_fetch( widecharmap, - (char*)&uvc, - sizeof( UV ), - 0); - if ( !svpp ) { - charid = 0; - } else { - charid=(U16)SvIV( *svpp ); - } - } - /* charid is now 0 if we dont know the char read, or - * nonzero if we do */ - if ( charid ) { - - U16 check; - U32 newstate = 0; - - charid--; - if ( !trie->states[ state ].trans.list ) { - TRIE_LIST_NEW( state ); - } - for ( check = 1; - check <= TRIE_LIST_USED( state ); - check++ ) - { - if ( TRIE_LIST_ITEM( state, check ).forid - == charid ) - { - newstate = TRIE_LIST_ITEM( state, check ).newstate; - break; - } - } - if ( ! newstate ) { - newstate = next_alloc++; - prev_states[newstate] = state; - TRIE_LIST_PUSH( state, charid, newstate ); - transcount++; - } - state = newstate; - } else { - Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc ); - } - } - } else { - /* If we end up here it is because we skipped past a NOTHING, but did not end up - * on a trieable type. So we need to reset noper back to point at the first regop - * in the branch before we call TRIE_HANDLE_WORD() - */ - noper= REGNODE_AFTER(cur); - } - TRIE_HANDLE_WORD(state); - - } /* end second pass */ - - /* next alloc is the NEXT state to be allocated */ - trie->statecount = next_alloc; - trie->states = (reg_trie_state *) - PerlMemShared_realloc( trie->states, - next_alloc - * sizeof(reg_trie_state) ); - - /* and now dump it out before we compress it */ - DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, - revcharmap, next_alloc, - depth+1) - ); - - trie->trans = (reg_trie_trans *) - PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); - { - U32 state; - U32 tp = 0; - U32 zp = 0; - - - for( state=1 ; state < next_alloc ; state ++ ) { - U32 base=0; - - /* - DEBUG_TRIE_COMPILE_MORE_r( - Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp) - ); - */ - - if (trie->states[state].trans.list) { - U16 minid=TRIE_LIST_ITEM( state, 1).forid; - U16 maxid=minid; - U16 idx; - - for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { - const U16 forid = TRIE_LIST_ITEM( state, idx).forid; - if ( forid < minid ) { - minid=forid; - } else if ( forid > maxid ) { - maxid=forid; - } - } - if ( transcount < tp + maxid - minid + 1) { - transcount *= 2; - trie->trans = (reg_trie_trans *) - PerlMemShared_realloc( trie->trans, - transcount - * sizeof(reg_trie_trans) ); - Zero( trie->trans + (transcount / 2), - transcount / 2, - reg_trie_trans ); - } - base = trie->uniquecharcount + tp - minid; - if ( maxid == minid ) { - U32 set = 0; - for ( ; zp < tp ; zp++ ) { - if ( ! trie->trans[ zp ].next ) { - base = trie->uniquecharcount + zp - minid; - trie->trans[ zp ].next = TRIE_LIST_ITEM( state, - 1).newstate; - trie->trans[ zp ].check = state; - set = 1; - break; - } - } - if ( !set ) { - trie->trans[ tp ].next = TRIE_LIST_ITEM( state, - 1).newstate; - trie->trans[ tp ].check = state; - tp++; - zp = tp; - } - } else { - for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { - const U32 tid = base - - trie->uniquecharcount - + TRIE_LIST_ITEM( state, idx ).forid; - trie->trans[ tid ].next = TRIE_LIST_ITEM( state, - idx ).newstate; - trie->trans[ tid ].check = state; - } - tp += ( maxid - minid + 1 ); - } - Safefree(trie->states[ state ].trans.list); - } - /* - DEBUG_TRIE_COMPILE_MORE_r( - Perl_re_printf( aTHX_ " base: %d\n",base); - ); - */ - trie->states[ state ].trans.base=base; - } - trie->lasttrans = tp + 1; - } - } else { - /* - Second Pass -- Flat Table Representation. - - we dont use the 0 slot of either trans[] or states[] so we add 1 to - each. We know that we will need Charcount+1 trans at most to store - the data (one row per char at worst case) So we preallocate both - structures assuming worst case. - - We then construct the trie using only the .next slots of the entry - structs. - - We use the .check field of the first entry of the node temporarily - to make compression both faster and easier by keeping track of how - many non zero fields are in the node. - - Since trans are numbered from 1 any 0 pointer in the table is a FAIL - transition. - - There are two terms at use here: state as a TRIE_NODEIDX() which is - a number representing the first entry of the node, and state as a - TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) - and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) - if there are 2 entrys per node. eg: - - A B A B - 1. 2 4 1. 3 7 - 2. 0 3 3. 0 5 - 3. 0 0 5. 0 0 - 4. 0 0 7. 0 0 - - The table is internally in the right hand, idx form. However as we - also have to deal with the states array which is indexed by nodenum - we have to use TRIE_NODENUM() to convert. - - */ - DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n", - depth+1)); - - trie->trans = (reg_trie_trans *) - PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) - * trie->uniquecharcount + 1, - sizeof(reg_trie_trans) ); - trie->states = (reg_trie_state *) - PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, - sizeof(reg_trie_state) ); - next_alloc = trie->uniquecharcount + 1; - - - for ( cur = first ; cur < last ; cur = regnext( cur ) ) { - - regnode *noper = REGNODE_AFTER( cur ); - - U32 state = 1; /* required init */ - - U16 charid = 0; /* sanity init */ - U32 accept_state = 0; /* sanity init */ - - U32 wordlen = 0; /* required init */ - - if (OP(noper) == NOTHING) { - regnode *noper_next= regnext(noper); - if (noper_next < tail) - noper= noper_next; - /* we will undo this assignment if noper does not - * point at a trieable type in the else clause of - * the following statement. */ - } - - if ( noper < tail - && ( OP(noper) == flags - || (flags == EXACT && OP(noper) == EXACT_REQ8) - || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 - || OP(noper) == EXACTFUP)))) - { - const U8 *uc= (U8*)STRING(noper); - const U8 *e= uc + STR_LEN(noper); - - for ( ; uc < e ; uc += len ) { - - TRIE_READ_CHAR; - - if ( uvc < 256 ) { - charid = trie->charmap[ uvc ]; - } else { - SV* const * const svpp = hv_fetch( widecharmap, - (char*)&uvc, - sizeof( UV ), - 0); - charid = svpp ? (U16)SvIV(*svpp) : 0; - } - if ( charid ) { - charid--; - if ( !trie->trans[ state + charid ].next ) { - trie->trans[ state + charid ].next = next_alloc; - trie->trans[ state ].check++; - prev_states[TRIE_NODENUM(next_alloc)] - = TRIE_NODENUM(state); - next_alloc += trie->uniquecharcount; - } - state = trie->trans[ state + charid ].next; - } else { - Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc ); - } - /* charid is now 0 if we dont know the char read, or - * nonzero if we do */ - } - } else { - /* If we end up here it is because we skipped past a NOTHING, but did not end up - * on a trieable type. So we need to reset noper back to point at the first regop - * in the branch before we call TRIE_HANDLE_WORD(). - */ - noper= REGNODE_AFTER(cur); - } - accept_state = TRIE_NODENUM( state ); - TRIE_HANDLE_WORD(accept_state); - - } /* end second pass */ - - /* and now dump it out before we compress it */ - DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, - revcharmap, - next_alloc, depth+1)); - - { - /* - * Inplace compress the table.* - - For sparse data sets the table constructed by the trie algorithm will - be mostly 0/FAIL transitions or to put it another way mostly empty. - (Note that leaf nodes will not contain any transitions.) - - This algorithm compresses the tables by eliminating most such - transitions, at the cost of a modest bit of extra work during lookup: - - - Each states[] entry contains a .base field which indicates the - index in the state[] array wheres its transition data is stored. - - - If .base is 0 there are no valid transitions from that node. - - - If .base is nonzero then charid is added to it to find an entry in - the trans array. - - -If trans[states[state].base+charid].check!=state then the - transition is taken to be a 0/Fail transition. Thus if there are fail - transitions at the front of the node then the .base offset will point - somewhere inside the previous nodes data (or maybe even into a node - even earlier), but the .check field determines if the transition is - valid. - - XXX - wrong maybe? - The following process inplace converts the table to the compressed - table: We first do not compress the root node 1,and mark all its - .check pointers as 1 and set its .base pointer as 1 as well. This - allows us to do a DFA construction from the compressed table later, - and ensures that any .base pointers we calculate later are greater - than 0. - - - We set 'pos' to indicate the first entry of the second node. - - - We then iterate over the columns of the node, finding the first and - last used entry at l and m. We then copy l..m into pos..(pos+m-l), - and set the .check pointers accordingly, and advance pos - appropriately and repreat for the next node. Note that when we copy - the next pointers we have to convert them from the original - NODEIDX form to NODENUM form as the former is not valid post - compression. - - - If a node has no transitions used we mark its base as 0 and do not - advance the pos pointer. - - - If a node only has one transition we use a second pointer into the - structure to fill in allocated fail transitions from other states. - This pointer is independent of the main pointer and scans forward - looking for null transitions that are allocated to a state. When it - finds one it writes the single transition into the "hole". If the - pointer doesnt find one the single transition is appended as normal. - - - Once compressed we can Renew/realloc the structures to release the - excess space. - - See "Table-Compression Methods" in sec 3.9 of the Red Dragon, - specifically Fig 3.47 and the associated pseudocode. - - demq - */ - const U32 laststate = TRIE_NODENUM( next_alloc ); - U32 state, charid; - U32 pos = 0, zp=0; - trie->statecount = laststate; - - for ( state = 1 ; state < laststate ; state++ ) { - U8 flag = 0; - const U32 stateidx = TRIE_NODEIDX( state ); - const U32 o_used = trie->trans[ stateidx ].check; - U32 used = trie->trans[ stateidx ].check; - trie->trans[ stateidx ].check = 0; - - for ( charid = 0; - used && charid < trie->uniquecharcount; - charid++ ) - { - if ( flag || trie->trans[ stateidx + charid ].next ) { - if ( trie->trans[ stateidx + charid ].next ) { - if (o_used == 1) { - for ( ; zp < pos ; zp++ ) { - if ( ! trie->trans[ zp ].next ) { - break; - } - } - trie->states[ state ].trans.base - = zp - + trie->uniquecharcount - - charid ; - trie->trans[ zp ].next - = SAFE_TRIE_NODENUM( trie->trans[ stateidx - + charid ].next ); - trie->trans[ zp ].check = state; - if ( ++zp > pos ) pos = zp; - break; - } - used--; - } - if ( !flag ) { - flag = 1; - trie->states[ state ].trans.base - = pos + trie->uniquecharcount - charid ; - } - trie->trans[ pos ].next - = SAFE_TRIE_NODENUM( - trie->trans[ stateidx + charid ].next ); - trie->trans[ pos ].check = state; - pos++; - } - } - } - trie->lasttrans = pos + 1; - trie->states = (reg_trie_state *) - PerlMemShared_realloc( trie->states, laststate - * sizeof(reg_trie_state) ); - DEBUG_TRIE_COMPILE_MORE_r( - Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n", - depth+1, - (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount - + 1 ), - (IV)next_alloc, - (IV)pos, - ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); - ); - - } /* end table compress */ - } - DEBUG_TRIE_COMPILE_MORE_r( - Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n", - depth+1, - (UV)trie->statecount, - (UV)trie->lasttrans) - ); - /* resize the trans array to remove unused space */ - trie->trans = (reg_trie_trans *) - PerlMemShared_realloc( trie->trans, trie->lasttrans - * sizeof(reg_trie_trans) ); - - { /* Modify the program and insert the new TRIE node */ - U8 nodetype =(U8) flags; - char *str=NULL; - -#ifdef DEBUGGING - regnode *optimize = NULL; -#endif /* DEBUGGING */ - /* - This means we convert either the first branch or the first Exact, - depending on whether the thing following (in 'last') is a branch - or not and whther first is the startbranch (ie is it a sub part of - the alternation or is it the whole thing.) - Assuming its a sub part we convert the EXACT otherwise we convert - the whole branch sequence, including the first. - */ - /* Find the node we are going to overwrite */ - if ( first != startbranch || OP( last ) == BRANCH ) { - /* branch sub-chain */ - NEXT_OFF( first ) = (U16)(last - first); - /* whole branch chain */ - } - /* But first we check to see if there is a common prefix we can - split out as an EXACT and put in front of the TRIE node. */ - trie->startstate= 1; - if ( trie->bitmap && !widecharmap && !trie->jump ) { - /* we want to find the first state that has more than - * one transition, if that state is not the first state - * then we have a common prefix which we can remove. - */ - U32 state; - for ( state = 1 ; state < trie->statecount-1 ; state++ ) { - U32 ofs = 0; - I32 first_ofs = -1; /* keeps track of the ofs of the first - transition, -1 means none */ - U32 count = 0; - const U32 base = trie->states[ state ].trans.base; - - /* does this state terminate an alternation? */ - if ( trie->states[state].wordnum ) - count = 1; - - for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { - if ( ( base + ofs >= trie->uniquecharcount ) && - ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && - trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) - { - if ( ++count > 1 ) { - /* we have more than one transition */ - SV **tmp; - U8 *ch; - /* if this is the first state there is no common prefix - * to extract, so we can exit */ - if ( state == 1 ) break; - tmp = av_fetch_simple( revcharmap, ofs, 0); - ch = (U8*)SvPV_nolen_const( *tmp ); - - /* if we are on count 2 then we need to initialize the - * bitmap, and store the previous char if there was one - * in it*/ - if ( count == 2 ) { - /* clear the bitmap */ - Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); - DEBUG_OPTIMISE_r( - Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [", - depth+1, - (UV)state)); - if (first_ofs >= 0) { - SV ** const tmp = av_fetch_simple( revcharmap, first_ofs, 0); - const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); - - TRIE_BITMAP_SET_FOLDED(trie,*ch, folder); - DEBUG_OPTIMISE_r( - Perl_re_printf( aTHX_ "%s", (char*)ch) - ); - } - } - /* store the current firstchar in the bitmap */ - TRIE_BITMAP_SET_FOLDED(trie,*ch, folder); - DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch)); - } - first_ofs = ofs; - } - } - if ( count == 1 ) { - /* This state has only one transition, its transition is part - * of a common prefix - we need to concatenate the char it - * represents to what we have so far. */ - SV **tmp = av_fetch_simple( revcharmap, first_ofs, 0); - STRLEN len; - char *ch = SvPV( *tmp, len ); - DEBUG_OPTIMISE_r({ - SV *sv=sv_newmortal(); - Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n", - depth+1, - (UV)state, (UV)first_ofs, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, - PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR - ) - ); - }); - if ( state==1 ) { - OP( convert ) = nodetype; - str=STRING(convert); - setSTR_LEN(convert, 0); - } - assert( ( STR_LEN(convert) + len ) < 256 ); - setSTR_LEN(convert, (U8)(STR_LEN(convert) + len)); - while (len--) - *str++ = *ch++; - } else { -#ifdef DEBUGGING - if (state>1) - DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n")); -#endif - break; - } - } - trie->prefixlen = (state-1); - if (str) { - regnode *n = REGNODE_AFTER(convert); - assert( n - convert <= U16_MAX ); - NEXT_OFF(convert) = n - convert; - trie->startstate = state; - trie->minlen -= (state - 1); - trie->maxlen -= (state - 1); -#ifdef DEBUGGING - /* At least the UNICOS C compiler choked on this - * being argument to DEBUG_r(), so let's just have - * it right here. */ - if ( -#ifdef PERL_EXT_RE_BUILD - 1 -#else - DEBUG_r_TEST -#endif - ) { - U32 word = trie->wordcount; - while (word--) { - SV ** const tmp = av_fetch_simple( trie_words, word, 0 ); - if (tmp) { - if ( STR_LEN(convert) <= SvCUR(*tmp) ) - sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); - else - sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); - } - } - } -#endif - if (trie->maxlen) { - convert = n; - } else { - NEXT_OFF(convert) = (U16)(tail - convert); - DEBUG_r(optimize= n); - } - } - } - if (!jumper) - jumper = last; - if ( trie->maxlen ) { - NEXT_OFF( convert ) = (U16)(tail - convert); - ARG_SET( convert, data_slot ); - /* Store the offset to the first unabsorbed branch in - jump[0], which is otherwise unused by the jump logic. - We use this when dumping a trie and during optimisation. */ - if (trie->jump) - trie->jump[0] = (U16)(nextbranch - convert); - - /* If the start state is not accepting (meaning there is no empty string/NOTHING) - * and there is a bitmap - * and the first "jump target" node we found leaves enough room - * then convert the TRIE node into a TRIEC node, with the bitmap - * embedded inline in the opcode - this is hypothetically faster. - */ - if ( !trie->states[trie->startstate].wordnum - && trie->bitmap - && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) - { - OP( convert ) = TRIEC; - Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); - PerlMemShared_free(trie->bitmap); - trie->bitmap= NULL; - } else - OP( convert ) = TRIE; - - /* store the type in the flags */ - convert->flags = nodetype; - DEBUG_r({ - optimize = convert - + NODE_STEP_REGNODE - + REGNODE_ARG_LEN( OP( convert ) ); - }); - /* XXX We really should free up the resource in trie now, - as we won't use them - (which resources?) dmq */ - } - /* needed for dumping*/ - DEBUG_r(if (optimize) { - /* - Try to clean up some of the debris left after the - optimisation. - */ - while( optimize < jumper ) { - OP( optimize ) = OPTIMIZED; - optimize++; - } - }); - } /* end node insert */ - - /* Finish populating the prev field of the wordinfo array. Walk back - * from each accept state until we find another accept state, and if - * so, point the first word's .prev field at the second word. If the - * second already has a .prev field set, stop now. This will be the - * case either if we've already processed that word's accept state, - * or that state had multiple words, and the overspill words were - * already linked up earlier. - */ - { - U16 word; - U32 state; - U16 prev; - - for (word=1; word <= trie->wordcount; word++) { - prev = 0; - if (trie->wordinfo[word].prev) - continue; - state = trie->wordinfo[word].accept; - while (state) { - state = prev_states[state]; - if (!state) - break; - prev = trie->states[state].wordnum; - if (prev) - break; - } - trie->wordinfo[word].prev = prev; - } - Safefree(prev_states); - } - - - /* and now dump out the compressed format */ - DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); - - RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; -#ifdef DEBUGGING - RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; - RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; -#else - SvREFCNT_dec_NN(revcharmap); -#endif - return trie->jump - ? MADE_JUMP_TRIE - : trie->startstate>1 - ? MADE_EXACT_TRIE - : MADE_TRIE; -} - -STATIC regnode * -S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) -{ -/* The Trie is constructed and compressed now so we can build a fail array if - * it's needed - - This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and - 3.32 in the - "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, - Ullman 1985/88 - ISBN 0-201-10088-6 - - We find the fail state for each state in the trie, this state is the longest - proper suffix of the current state's 'word' that is also a proper prefix of - another word in our trie. State 1 represents the word '' and is thus the - default fail state. This allows the DFA not to have to restart after its - tried and failed a word at a given point, it simply continues as though it - had been matching the other word in the first place. - Consider - 'abcdgu'=~/abcdefg|cdgu/ - When we get to 'd' we are still matching the first word, we would encounter - 'g' which would fail, which would bring us to the state representing 'd' in - the second word where we would try 'g' and succeed, proceeding to match - 'cdgu'. - */ - /* add a fail transition */ - const U32 trie_offset = ARG(source); - reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; - U32 *q; - const U32 ucharcount = trie->uniquecharcount; - const U32 numstates = trie->statecount; - const U32 ubound = trie->lasttrans + ucharcount; - U32 q_read = 0; - U32 q_write = 0; - U32 charid; - U32 base = trie->states[ 1 ].trans.base; - U32 *fail; - reg_ac_data *aho; - const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); - regnode *stclass; - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; - PERL_UNUSED_CONTEXT; -#ifndef DEBUGGING - PERL_UNUSED_ARG(depth); -#endif - - if ( OP(source) == TRIE ) { - struct regnode_1 *op = (struct regnode_1 *) - PerlMemShared_calloc(1, sizeof(struct regnode_1)); - StructCopy(source, op, struct regnode_1); - stclass = (regnode *)op; - } else { - struct regnode_charclass *op = (struct regnode_charclass *) - PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); - StructCopy(source, op, struct regnode_charclass); - stclass = (regnode *)op; - } - OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */ - - ARG_SET( stclass, data_slot ); - aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); - RExC_rxi->data->data[ data_slot ] = (void*)aho; - aho->trie=trie_offset; - aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); - Copy( trie->states, aho->states, numstates, reg_trie_state ); - Newx( q, numstates, U32); - aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); - aho->refcount = 1; - fail = aho->fail; - /* initialize fail[0..1] to be 1 so that we always have - a valid final fail state */ - fail[ 0 ] = fail[ 1 ] = 1; - - for ( charid = 0; charid < ucharcount ; charid++ ) { - const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); - if ( newstate ) { - q[ q_write ] = newstate; - /* set to point at the root */ - fail[ q[ q_write++ ] ]=1; - } - } - while ( q_read < q_write) { - const U32 cur = q[ q_read++ % numstates ]; - base = trie->states[ cur ].trans.base; - - for ( charid = 0 ; charid < ucharcount ; charid++ ) { - const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); - if (ch_state) { - U32 fail_state = cur; - U32 fail_base; - do { - fail_state = fail[ fail_state ]; - fail_base = aho->states[ fail_state ].trans.base; - } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); - - fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); - fail[ ch_state ] = fail_state; - if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) - { - aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; - } - q[ q_write++ % numstates] = ch_state; - } - } - } - /* restore fail[0..1] to 0 so that we "fall out" of the AC loop - when we fail in state 1, this allows us to use the - charclass scan to find a valid start char. This is based on the principle - that theres a good chance the string being searched contains lots of stuff - that cant be a start char. - */ - fail[ 0 ] = fail[ 1 ] = 0; - DEBUG_TRIE_COMPILE_r({ - Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0", - depth, (UV)numstates - ); - for( q_read=1; q_read<numstates; q_read++ ) { - Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]); - } - Perl_re_printf( aTHX_ "\n"); - }); - Safefree(q); - /*RExC_seen |= REG_TRIEDFA_SEEN;*/ - return stclass; -} - - -/* The below joins as many adjacent EXACTish nodes as possible into a single - * one. The regop may be changed if the node(s) contain certain sequences that - * require special handling. The joining is only done if: - * 1) there is room in the current conglomerated node to entirely contain the - * next one. - * 2) they are compatible node types - * - * The adjacent nodes actually may be separated by NOTHING-kind nodes, and - * these get optimized out - * - * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full - * as possible, even if that means splitting an existing node so that its first - * part is moved to the preceding node. This would maximise the efficiency of - * memEQ during matching. - * - * If a node is to match under /i (folded), the number of characters it matches - * can be different than its character length if it contains a multi-character - * fold. *min_subtract is set to the total delta number of characters of the - * input nodes. - * - * And *unfolded_multi_char is set to indicate whether or not the node contains - * an unfolded multi-char fold. This happens when it won't be known until - * runtime whether the fold is valid or not; namely - * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the - * target string being matched against turns out to be UTF-8 is that fold - * valid; or - * 2) for EXACTFL nodes whose folding rules depend on the locale in force at - * runtime. - * (Multi-char folds whose components are all above the Latin1 range are not - * run-time locale dependent, and have already been folded by the time this - * function is called.) - * - * This is as good a place as any to discuss the design of handling these - * multi-character fold sequences. It's been wrong in Perl for a very long - * time. There are three code points in Unicode whose multi-character folds - * were long ago discovered to mess things up. The previous designs for - * dealing with these involved assigning a special node for them. This - * approach doesn't always work, as evidenced by this example: - * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches - * Both sides fold to "sss", but if the pattern is parsed to create a node that - * would match just the \xDF, it won't be able to handle the case where a - * successful match would have to cross the node's boundary. The new approach - * that hopefully generally solves the problem generates an EXACTFUP node - * that is "sss" in this case. - * - * It turns out that there are problems with all multi-character folds, and not - * just these three. Now the code is general, for all such cases. The - * approach taken is: - * 1) This routine examines each EXACTFish node that could contain multi- - * character folded sequences. Since a single character can fold into - * such a sequence, the minimum match length for this node is less than - * the number of characters in the node. This routine returns in - * *min_subtract how many characters to subtract from the actual - * length of the string to get a real minimum match length; it is 0 if - * there are no multi-char foldeds. This delta is used by the caller to - * adjust the min length of the match, and the delta between min and max, - * so that the optimizer doesn't reject these possibilities based on size - * constraints. - * - * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF) - * under /u, we fold it to 'ss' in regatom(), and in this routine, after - * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8 - * EXACTFU nodes. The node type of such nodes is then changed to - * EXACTFUP, indicating it is problematic, and needs careful handling. - * (The procedures in step 1) above are sufficient to handle this case in - * UTF-8 encoded nodes.) The reason this is problematic is that this is - * the only case where there is a possible fold length change in non-UTF-8 - * patterns. By reserving a special node type for problematic cases, the - * far more common regular EXACTFU nodes can be processed faster. - * regexec.c takes advantage of this. - * - * EXACTFUP has been created as a grab-bag for (hopefully uncommon) - * problematic cases. These all only occur when the pattern is not - * UTF-8. In addition to the 'ss' sequence where there is a possible fold - * length change, it handles the situation where the string cannot be - * entirely folded. The strings in an EXACTFish node are folded as much - * as possible during compilation in regcomp.c. This saves effort in - * regex matching. By using an EXACTFUP node when it is not possible to - * fully fold at compile time, regexec.c can know that everything in an - * EXACTFU node is folded, so folding can be skipped at runtime. The only - * case where folding in EXACTFU nodes can't be done at compile time is - * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This - * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes - * handle two very different cases. Alternatively, there could have been - * a node type where there are length changes, one for unfolded, and one - * for both. If yet another special case needed to be created, the number - * of required node types would have to go to 7. khw figures that even - * though there are plenty of node types to spare, that the maintenance - * cost wasn't worth the small speedup of doing it that way, especially - * since he thinks the MICRO SIGN is rarely encountered in practice. - * - * There are other cases where folding isn't done at compile time, but - * none of them are under /u, and hence not for EXACTFU nodes. The folds - * in EXACTFL nodes aren't known until runtime, and vary as the locale - * changes. Some folds in EXACTF depend on if the runtime target string - * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di - * when no fold in it depends on the UTF-8ness of the target string.) - * - * 3) A problem remains for unfolded multi-char folds. (These occur when the - * validity of the fold won't be known until runtime, and so must remain - * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA - * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot - * be an EXACTF node with a UTF-8 pattern.) They also occur for various - * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) - * The reason this is a problem is that the optimizer part of regexec.c - * (probably unwittingly, in Perl_regexec_flags()) makes an assumption - * that a character in the pattern corresponds to at most a single - * character in the target string. (And I do mean character, and not byte - * here, unlike other parts of the documentation that have never been - * updated to account for multibyte Unicode.) Sharp s in EXACTF and - * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA - * nodes it can match "\x{17F}\x{17F}". These, along with other ones in - * EXACTFL nodes, violate the assumption, and they are the only instances - * where it is violated. I'm reluctant to try to change the assumption, - * as the code involved is impenetrable to me (khw), so instead the code - * here punts. This routine examines EXACTFL nodes, and (when the pattern - * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a - * boolean indicating whether or not the node contains such a fold. When - * it is true, the caller sets a flag that later causes the optimizer in - * this file to not set values for the floating and fixed string lengths, - * and thus avoids the optimizer code in regexec.c that makes the invalid - * assumption. Thus, there is no optimization based on string lengths for - * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern - * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the - * assumption is wrong only in these cases is that all other non-UTF-8 - * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to - * their expanded versions. (Again, we can't prefold sharp s to 'ss' in - * EXACTF nodes because we don't know at compile time if it actually - * matches 'ss' or not. For EXACTF nodes it will match iff the target - * string is in UTF-8. This is in contrast to EXACTFU nodes, where it - * always matches; and EXACTFAA where it never does. In an EXACTFAA node - * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the - * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 - * string would require the pattern to be forced into UTF-8, the overhead - * of which we want to avoid. Similarly the unfolded multi-char folds in - * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 - * locale.) - * - * Similarly, the code that generates tries doesn't currently handle - * not-already-folded multi-char folds, and it looks like a pain to change - * that. Therefore, trie generation of EXACTFAA nodes with the sharp s - * doesn't work. Instead, such an EXACTFAA is turned into a new regnode, - * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people - * using /iaa matching will be doing so almost entirely with ASCII - * strings, so this should rarely be encountered in practice */ - -STATIC U32 -S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, - UV *min_subtract, bool *unfolded_multi_char, - U32 flags, regnode *val, U32 depth) -{ - /* Merge several consecutive EXACTish nodes into one. */ - - regnode *n = regnext(scan); - U32 stringok = 1; - regnode *next = REGNODE_AFTER_varies(scan); - U32 merged = 0; - U32 stopnow = 0; -#ifdef DEBUGGING - regnode *stop = scan; - DECLARE_AND_GET_RE_DEBUG_FLAGS; -#else - PERL_UNUSED_ARG(depth); -#endif - - PERL_ARGS_ASSERT_JOIN_EXACT; -#ifndef EXPERIMENTAL_INPLACESCAN - PERL_UNUSED_ARG(flags); - PERL_UNUSED_ARG(val); -#endif - DEBUG_PEEP("join", scan, depth, 0); - - assert(REGNODE_TYPE(OP(scan)) == EXACT); - - /* Look through the subsequent nodes in the chain. Skip NOTHING, merge - * EXACT ones that are mergeable to the current one. */ - while ( n - && ( REGNODE_TYPE(OP(n)) == NOTHING - || (stringok && REGNODE_TYPE(OP(n)) == EXACT)) - && NEXT_OFF(n) - && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) - { - - if (OP(n) == TAIL || n > next) - stringok = 0; - if (REGNODE_TYPE(OP(n)) == NOTHING) { - DEBUG_PEEP("skip:", n, depth, 0); - NEXT_OFF(scan) += NEXT_OFF(n); - next = n + NODE_STEP_REGNODE; -#ifdef DEBUGGING - if (stringok) - stop = n; -#endif - n = regnext(n); - } - else if (stringok) { - const unsigned int oldl = STR_LEN(scan); - regnode * const nnext = regnext(n); - - /* XXX I (khw) kind of doubt that this works on platforms (should - * Perl ever run on one) where U8_MAX is above 255 because of lots - * of other assumptions */ - /* Don't join if the sum can't fit into a single node */ - if (oldl + STR_LEN(n) > U8_MAX) - break; - - /* Joining something that requires UTF-8 with something that - * doesn't, means the result requires UTF-8. */ - if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) { - OP(scan) = EXACT_REQ8; - } - else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) { - ; /* join is compatible, no need to change OP */ - } - else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) { - OP(scan) = EXACTFU_REQ8; - } - else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) { - ; /* join is compatible, no need to change OP */ - } - else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) { - ; /* join is compatible, no need to change OP */ - } - else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) { - - /* Under /di, temporary EXACTFU_S_EDGE nodes are generated, - * which can join with EXACTFU ones. We check for this case - * here. These need to be resolved to either EXACTFU or - * EXACTF at joining time. They have nothing in them that - * would forbid them from being the more desirable EXACTFU - * nodes except that they begin and/or end with a single [Ss]. - * The reason this is problematic is because they could be - * joined in this loop with an adjacent node that ends and/or - * begins with [Ss] which would then form the sequence 'ss', - * which matches differently under /di than /ui, in which case - * EXACTFU can't be used. If the 'ss' sequence doesn't get - * formed, the nodes get absorbed into any adjacent EXACTFU - * node. And if the only adjacent node is EXACTF, they get - * absorbed into that, under the theory that a longer node is - * better than two shorter ones, even if one is EXACTFU. Note - * that EXACTFU_REQ8 is generated only for UTF-8 patterns, - * and the EXACTFU_S_EDGE ones only for non-UTF-8. */ - - if (STRING(n)[STR_LEN(n)-1] == 's') { - - /* Here the joined node would end with 's'. If the node - * following the combination is an EXACTF one, it's better to - * join this trailing edge 's' node with that one, leaving the - * current one in 'scan' be the more desirable EXACTFU */ - if (OP(nnext) == EXACTF) { - break; - } - - OP(scan) = EXACTFU_S_EDGE; - - } /* Otherwise, the beginning 's' of the 2nd node just - becomes an interior 's' in 'scan' */ - } - else if (OP(scan) == EXACTF && OP(n) == EXACTF) { - ; /* join is compatible, no need to change OP */ - } - else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) { - - /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE - * nodes. But the latter nodes can be also joined with EXACTFU - * ones, and that is a better outcome, so if the node following - * 'n' is EXACTFU, quit now so that those two can be joined - * later */ - if (OP(nnext) == EXACTFU) { - break; - } - - /* The join is compatible, and the combined node will be - * EXACTF. (These don't care if they begin or end with 's' */ - } - else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) { - if ( STRING(scan)[STR_LEN(scan)-1] == 's' - && STRING(n)[0] == 's') - { - /* When combined, we have the sequence 'ss', which means we - * have to remain /di */ - OP(scan) = EXACTF; - } - } - else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) { - if (STRING(n)[0] == 's') { - ; /* Here the join is compatible and the combined node - starts with 's', no need to change OP */ - } - else { /* Now the trailing 's' is in the interior */ - OP(scan) = EXACTFU; - } - } - else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) { - - /* The join is compatible, and the combined node will be - * EXACTF. (These don't care if they begin or end with 's' */ - OP(scan) = EXACTF; - } - else if (OP(scan) != OP(n)) { - - /* The only other compatible joinings are the same node type */ - break; - } - - DEBUG_PEEP("merg", n, depth, 0); - merged++; - - next = REGNODE_AFTER_varies(n); - NEXT_OFF(scan) += NEXT_OFF(n); - assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 ); - setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n))); - /* Now we can overwrite *n : */ - Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); -#ifdef DEBUGGING - stop = next - 1; -#endif - n = nnext; - if (stopnow) break; - } - -#ifdef EXPERIMENTAL_INPLACESCAN - if (flags && !NEXT_OFF(n)) { - DEBUG_PEEP("atch", val, depth, 0); - if (REGNODE_OFF_BY_ARG(OP(n))) { - ARG_SET(n, val - n); - } - else { - NEXT_OFF(n) = val - n; - } - stopnow = 1; - } -#endif - } - - /* This temporary node can now be turned into EXACTFU, and must, as - * regexec.c doesn't handle it */ - if (OP(scan) == EXACTFU_S_EDGE) { - OP(scan) = EXACTFU; - } - - *min_subtract = 0; - *unfolded_multi_char = FALSE; - - /* Here, all the adjacent mergeable EXACTish nodes have been merged. We - * can now analyze for sequences of problematic code points. (Prior to - * this final joining, sequences could have been split over boundaries, and - * hence missed). The sequences only happen in folding, hence for any - * non-EXACT EXACTish node */ - if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) { - U8* s0 = (U8*) STRING(scan); - U8* s = s0; - U8* s_end = s0 + STR_LEN(scan); - - int total_count_delta = 0; /* Total delta number of characters that - multi-char folds expand to */ - - /* One pass is made over the node's string looking for all the - * possibilities. To avoid some tests in the loop, there are two main - * cases, for UTF-8 patterns (which can't have EXACTF nodes) and - * non-UTF-8 */ - if (UTF) { - U8* folded = NULL; - - if (OP(scan) == EXACTFL) { - U8 *d; - - /* An EXACTFL node would already have been changed to another - * node type unless there is at least one character in it that - * is problematic; likely a character whose fold definition - * won't be known until runtime, and so has yet to be folded. - * For all but the UTF-8 locale, folds are 1-1 in length, but - * to handle the UTF-8 case, we need to create a temporary - * folded copy using UTF-8 locale rules in order to analyze it. - * This is because our macros that look to see if a sequence is - * a multi-char fold assume everything is folded (otherwise the - * tests in those macros would be too complicated and slow). - * Note that here, the non-problematic folds will have already - * been done, so we can just copy such characters. We actually - * don't completely fold the EXACTFL string. We skip the - * unfolded multi-char folds, as that would just create work - * below to figure out the size they already are */ - - Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); - d = folded; - while (s < s_end) { - STRLEN s_len = UTF8SKIP(s); - if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { - Copy(s, d, s_len, U8); - d += s_len; - } - else if (is_FOLDS_TO_MULTI_utf8(s)) { - *unfolded_multi_char = TRUE; - Copy(s, d, s_len, U8); - d += s_len; - } - else if (isASCII(*s)) { - *(d++) = toFOLD(*s); - } - else { - STRLEN len; - _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL); - d += len; - } - s += s_len; - } - - /* Point the remainder of the routine to look at our temporary - * folded copy */ - s = folded; - s_end = d; - } /* End of creating folded copy of EXACTFL string */ - - /* Examine the string for a multi-character fold sequence. UTF-8 - * patterns have all characters pre-folded by the time this code is - * executed */ - while (s < s_end - 1) /* Can stop 1 before the end, as minimum - length sequence we are looking for is 2 */ - { - int count = 0; /* How many characters in a multi-char fold */ - int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); - if (! len) { /* Not a multi-char fold: get next char */ - s += UTF8SKIP(s); - continue; - } - - { /* Here is a generic multi-char fold. */ - U8* multi_end = s + len; - - /* Count how many characters are in it. In the case of - * /aa, no folds which contain ASCII code points are - * allowed, so check for those, and skip if found. */ - if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) { - count = utf8_length(s, multi_end); - s = multi_end; - } - else { - while (s < multi_end) { - if (isASCII(*s)) { - s++; - goto next_iteration; - } - else { - s += UTF8SKIP(s); - } - count++; - } - } - } - - /* The delta is how long the sequence is minus 1 (1 is how long - * the character that folds to the sequence is) */ - total_count_delta += count - 1; - next_iteration: ; - } - - /* We created a temporary folded copy of the string in EXACTFL - * nodes. Therefore we need to be sure it doesn't go below zero, - * as the real string could be shorter */ - if (OP(scan) == EXACTFL) { - int total_chars = utf8_length((U8*) STRING(scan), - (U8*) STRING(scan) + STR_LEN(scan)); - if (total_count_delta > total_chars) { - total_count_delta = total_chars; - } - } - - *min_subtract += total_count_delta; - Safefree(folded); - } - else if (OP(scan) == EXACTFAA) { - - /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char - * fold to the ASCII range (and there are no existing ones in the - * upper latin1 range). But, as outlined in the comments preceding - * this function, we need to flag any occurrences of the sharp s. - * This character forbids trie formation (because of added - * complexity) */ -#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ - || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ - || UNICODE_DOT_DOT_VERSION > 0) - while (s < s_end) { - if (*s == LATIN_SMALL_LETTER_SHARP_S) { - OP(scan) = EXACTFAA_NO_TRIE; - *unfolded_multi_char = TRUE; - break; - } - s++; - } - } - else if (OP(scan) != EXACTFAA_NO_TRIE) { - - /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char - * folds that are all Latin1. As explained in the comments - * preceding this function, we look also for the sharp s in EXACTF - * and EXACTFL nodes; it can be in the final position. Otherwise - * we can stop looking 1 byte earlier because have to find at least - * two characters for a multi-fold */ - const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) - ? s_end - : s_end -1; - - while (s < upper) { - int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); - if (! len) { /* Not a multi-char fold. */ - if (*s == LATIN_SMALL_LETTER_SHARP_S - && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) - { - *unfolded_multi_char = TRUE; - } - s++; - continue; - } - - if (len == 2 - && isALPHA_FOLD_EQ(*s, 's') - && isALPHA_FOLD_EQ(*(s+1), 's')) - { - - /* EXACTF nodes need to know that the minimum length - * changed so that a sharp s in the string can match this - * ss in the pattern, but they remain EXACTF nodes, as they - * won't match this unless the target string is in UTF-8, - * which we don't know until runtime. EXACTFL nodes can't - * transform into EXACTFU nodes */ - if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { - OP(scan) = EXACTFUP; - } - } - - *min_subtract += len - 1; - s += len; - } -#endif - } - } - -#ifdef DEBUGGING - /* Allow dumping but overwriting the collection of skipped - * ops and/or strings with fake optimized ops */ - n = REGNODE_AFTER_varies(scan); - while (n <= stop) { - OP(n) = OPTIMIZED; - FLAGS(n) = 0; - NEXT_OFF(n) = 0; - n++; - } -#endif - DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);}); - return stopnow; -} - -/* REx optimizer. Converts nodes into quicker variants "in place". - Finds fixed substrings. */ - -/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set - to the position after last scanned or to NULL. */ - -#define INIT_AND_WITHP \ - assert(!and_withp); \ - Newx(and_withp, 1, regnode_ssc); \ - SAVEFREEPV(and_withp) - - -static void -S_unwind_scan_frames(pTHX_ const void *p) -{ - scan_frame *f= (scan_frame *)p; - do { - scan_frame *n= f->next_frame; - Safefree(f); - f= n; - } while (f); -} - -/* Follow the next-chain of the current node and optimize away - all the NOTHINGs from it. - */ -STATIC void -S_rck_elide_nothing(pTHX_ regnode *node) -{ - PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING; - - if (OP(node) != CURLYX) { - const int max = (REGNODE_OFF_BY_ARG(OP(node)) - ? I32_MAX - /* I32 may be smaller than U16 on CRAYs! */ - : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); - int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG(node) : NEXT_OFF(node)); - int noff; - regnode *n = node; - - /* Skip NOTHING and LONGJMP. */ - while ( - (n = regnext(n)) - && ( - (REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n))) - || ((OP(n) == LONGJMP) && (noff = ARG(n))) - ) - && off + noff < max - ) { - off += noff; - } - if (REGNODE_OFF_BY_ARG(OP(node))) - ARG(node) = off; - else - NEXT_OFF(node) = off; - } - return; -} - -/* the return from this sub is the minimum length that could possibly match */ -STATIC SSize_t -S_study_chunk(pTHX_ - RExC_state_t *pRExC_state, - regnode **scanp, /* Start here (read-write). */ - SSize_t *minlenp, /* used for the minlen of substrings? */ - SSize_t *deltap, /* Write maxlen-minlen here. */ - regnode *last, /* Stop before this one. */ - scan_data_t *data, /* string data about the pattern */ - I32 stopparen, /* treat CLOSE-N as END, see GOSUB */ - U32 recursed_depth, /* how deep have we recursed via GOSUB */ - regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */ - U32 flags, /* flags controlling this call, see SCF_ flags */ - U32 depth, /* how deep have we recursed period */ - bool was_mutate_ok /* TRUE if in-place optimizations are allowed. - FALSE only if the caller (recursively) was - prohibited from modifying the regops, because - a higher caller is holding a ptr to them. */ -) -{ - /* vars about the regnodes we are working with */ - regnode *scan = *scanp; /* the current opcode we are inspecting */ - regnode *next = NULL; /* the next opcode beyond scan, tmp var */ - regnode *first_non_open = scan; /* FIXME: should this init to NULL? - the first non open regop, if the init - val IS an OPEN then we will skip past - it just after the var decls section */ - I32 code = 0; /* temp var used to hold the optype of a regop */ - - /* vars about the min and max length of the pattern */ - SSize_t min = 0; /* min length of this part of the pattern */ - SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT - this is adjusted down if we find - an ACCEPT */ - SSize_t delta = 0; /* difference between min and max length - (not accounting for stopmin) */ - - /* vars about capture buffers in the pattern */ - I32 pars = 0; /* count of OPEN opcodes */ - I32 is_par = OP(scan) == OPEN ? PARNO(scan) : 0; /* is this op an OPEN? */ - - /* vars about whether this pattern contains something that can match - * infinitely long strings, eg, X* or X+ */ - int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); - int is_inf_internal = 0; /* The studied chunk is infinite */ - - /* scan_data_t (struct) is used to hold information about the substrings - * and start class we have extracted from the string */ - scan_data_t data_fake; /* temp var used for recursing in some cases */ - - SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do - trie optimizations */ - - scan_frame *frame = NULL; /* used as part of fake recursion */ - - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_STUDY_CHUNK; - RExC_study_started= 1; - - Zero(&data_fake, 1, scan_data_t); - - if ( depth == 0 ) { - while (first_non_open && OP(first_non_open) == OPEN) - first_non_open=regnext(first_non_open); - } - - fake_study_recurse: - DEBUG_r( - RExC_study_chunk_recursed_count++; - ); - DEBUG_OPTIMISE_MORE_r( - { - Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p", - depth, (long)stopparen, - (unsigned long)RExC_study_chunk_recursed_count, - (unsigned long)depth, (unsigned long)recursed_depth, - scan, - last); - if (recursed_depth) { - U32 i; - U32 j; - for ( j = 0 ; j < recursed_depth ; j++ ) { - for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) { - if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) { - Perl_re_printf( aTHX_ " %d",(int)i); - break; - } - } - if ( j + 1 < recursed_depth ) { - Perl_re_printf( aTHX_ ","); - } - } - } - Perl_re_printf( aTHX_ "\n"); - } - ); - while ( scan && OP(scan) != END && scan < last ){ - UV min_subtract = 0; /* How mmany chars to subtract from the minimum - node length to get a real minimum (because - the folded version may be shorter) */ - bool unfolded_multi_char = FALSE; - /* avoid mutating ops if we are anywhere within the recursed or - * enframed handling for a GOSUB: the outermost level will handle it. - */ - bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub); - /* Peephole optimizer: */ - DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta); - DEBUG_PEEP("Peep", scan, depth, flags); - - - /* The reason we do this here is that we need to deal with things like - * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT - * parsing code, as each (?:..) is handled by a different invocation of - * reg() -- Yves - */ - if (REGNODE_TYPE(OP(scan)) == EXACT - && OP(scan) != LEXACT - && OP(scan) != LEXACT_REQ8 - && mutate_ok - ) { - join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char, - 0, NULL, depth + 1); - } - - /* Follow the next-chain of the current node and optimize - away all the NOTHINGs from it. - */ - rck_elide_nothing(scan); - - /* The principal pseudo-switch. Cannot be a switch, since we look into - * several different things. */ - if ( OP(scan) == DEFINEP ) { - SSize_t minlen = 0; - SSize_t deltanext = 0; - SSize_t fake_last_close = 0; - regnode *fake_last_close_op = NULL; - U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY); - - StructCopy(&zero_scan_data, &data_fake, scan_data_t); - scan = regnext(scan); - assert( OP(scan) == IFTHEN ); - DEBUG_PEEP("expect IFTHEN", scan, depth, flags); - - data_fake.last_closep= &fake_last_close; - data_fake.last_close_opp= &fake_last_close_op; - minlen = *minlenp; - next = regnext(scan); - scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN); - DEBUG_PEEP("scan", scan, depth, flags); - DEBUG_PEEP("next", next, depth, flags); - - /* we suppose the run is continuous, last=next... - * NOTE we dont use the return here! */ - /* DEFINEP study_chunk() recursion */ - (void)study_chunk(pRExC_state, &scan, &minlen, - &deltanext, next, &data_fake, stopparen, - recursed_depth, NULL, f, depth+1, mutate_ok); - - scan = next; - } else - if ( - OP(scan) == BRANCH || - OP(scan) == BRANCHJ || - OP(scan) == IFTHEN - ) { - next = regnext(scan); - code = OP(scan); - - /* The op(next)==code check below is to see if we - * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN" - * IFTHEN is special as it might not appear in pairs. - * Not sure whether BRANCH-BRANCHJ is possible, regardless - * we dont handle it cleanly. */ - if (OP(next) == code || code == IFTHEN) { - /* NOTE - There is similar code to this block below for - * handling TRIE nodes on a re-study. If you change stuff here - * check there too. */ - SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0; - regnode_ssc accum; - regnode * const startbranch=scan; - - if (flags & SCF_DO_SUBSTR) { - /* Cannot merge strings after this. */ - scan_commit(pRExC_state, data, minlenp, is_inf); - } - - if (flags & SCF_DO_STCLASS) - ssc_init_zero(pRExC_state, &accum); - - while (OP(scan) == code) { - SSize_t deltanext, minnext, fake_last_close = 0; - regnode *fake_last_close_op = NULL; - U32 f = (flags & SCF_TRIE_DOING_RESTUDY); - regnode_ssc this_class; - - DEBUG_PEEP("Branch", scan, depth, flags); - - num++; - StructCopy(&zero_scan_data, &data_fake, scan_data_t); - if (data) { - data_fake.whilem_c = data->whilem_c; - data_fake.last_closep = data->last_closep; - data_fake.last_close_opp = data->last_close_opp; - } - else { - data_fake.last_closep = &fake_last_close; - data_fake.last_close_opp = &fake_last_close_op; - } - - data_fake.pos_delta = delta; - next = regnext(scan); - - scan = REGNODE_AFTER_opcode(scan, code); - - if (flags & SCF_DO_STCLASS) { - ssc_init(pRExC_state, &this_class); - data_fake.start_class = &this_class; - f |= SCF_DO_STCLASS_AND; - } - if (flags & SCF_WHILEM_VISITED_POS) - f |= SCF_WHILEM_VISITED_POS; - - /* we suppose the run is continuous, last=next...*/ - /* recurse study_chunk() for each BRANCH in an alternation */ - minnext = study_chunk(pRExC_state, &scan, minlenp, - &deltanext, next, &data_fake, stopparen, - recursed_depth, NULL, f, depth+1, - mutate_ok); - - if (min1 > minnext) - min1 = minnext; - if (deltanext == OPTIMIZE_INFTY) { - is_inf = is_inf_internal = 1; - max1 = OPTIMIZE_INFTY; - } else if (max1 < minnext + deltanext) - max1 = minnext + deltanext; - scan = next; - if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) - pars++; - if (data_fake.flags & SCF_SEEN_ACCEPT) { - if ( stopmin > minnext) - stopmin = min + min1; - flags &= ~SCF_DO_SUBSTR; - if (data) - data->flags |= SCF_SEEN_ACCEPT; - } - if (data) { - if (data_fake.flags & SF_HAS_EVAL) - data->flags |= SF_HAS_EVAL; - data->whilem_c = data_fake.whilem_c; - } - if (flags & SCF_DO_STCLASS) - ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); - DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta); - } - if (code == IFTHEN && num < 2) /* Empty ELSE branch */ - min1 = 0; - if (flags & SCF_DO_SUBSTR) { - data->pos_min += min1; - if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1)) - data->pos_delta = OPTIMIZE_INFTY; - else - data->pos_delta += max1 - min1; - if (max1 != min1 || is_inf) - data->cur_is_floating = 1; - } - min += min1; - if (delta == OPTIMIZE_INFTY - || OPTIMIZE_INFTY - delta - (max1 - min1) < 0) - delta = OPTIMIZE_INFTY; - else - delta += max1 - min1; - if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); - if (min1) { - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - flags &= ~SCF_DO_STCLASS; - } - } - else if (flags & SCF_DO_STCLASS_AND) { - if (min1) { - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); - flags &= ~SCF_DO_STCLASS; - } - else { - /* Switch to OR mode: cache the old value of - * data->start_class */ - INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, regnode_ssc); - flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, regnode_ssc); - flags |= SCF_DO_STCLASS_OR; - } - } - DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta); - - if (PERL_ENABLE_TRIE_OPTIMISATION - && OP(startbranch) == BRANCH - && mutate_ok - ) { - /* demq. - - Assuming this was/is a branch we are dealing with: 'scan' - now points at the item that follows the branch sequence, - whatever it is. We now start at the beginning of the - sequence and look for subsequences of - - BRANCH->EXACT=>x1 - BRANCH->EXACT=>x2 - tail - - which would be constructed from a pattern like - /A|LIST|OF|WORDS/ - - If we can find such a subsequence we need to turn the first - element into a trie and then add the subsequent branch exact - strings to the trie. - - We have two cases - - 1. patterns where the whole set of branches can be - converted. - - 2. patterns where only a subset can be converted. - - In case 1 we can replace the whole set with a single regop - for the trie. In case 2 we need to keep the start and end - branches so - - 'BRANCH EXACT; BRANCH EXACT; BRANCH X' - becomes BRANCH TRIE; BRANCH X; - - There is an additional case, that being where there is a - common prefix, which gets split out into an EXACT like node - preceding the TRIE node. - - If X(1..n)==tail then we can do a simple trie, if not we make - a "jump" trie, such that when we match the appropriate word - we "jump" to the appropriate tail node. Essentially we turn - a nested if into a case structure of sorts. - - */ - - int made=0; - if (!re_trie_maxbuff) { - re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); - if (!SvIOK(re_trie_maxbuff)) - sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); - } - if ( SvIV(re_trie_maxbuff)>=0 ) { - regnode *cur; - regnode *first = (regnode *)NULL; - regnode *prev = (regnode *)NULL; - regnode *tail = scan; - U8 trietype = 0; - U32 count=0; - - /* var tail is used because there may be a TAIL - regop in the way. Ie, the exacts will point to the - thing following the TAIL, but the last branch will - point at the TAIL. So we advance tail. If we - have nested (?:) we may have to move through several - tails. - */ - - while ( OP( tail ) == TAIL ) { - /* this is the TAIL generated by (?:) */ - tail = regnext( tail ); - } - - - DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state); - Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n", - depth+1, - "Looking for TRIE'able sequences. Tail node is ", - (UV) REGNODE_OFFSET(tail), - SvPV_nolen_const( RExC_mysv ) - ); - }); - - /* - - Step through the branches - cur represents each branch, - noper is the first thing to be matched as part - of that branch - noper_next is the regnext() of that node. - - We normally handle a case like this - /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also - support building with NOJUMPTRIE, which restricts - the trie logic to structures like /FOO|BAR/. - - If noper is a trieable nodetype then the branch is - a possible optimization target. If we are building - under NOJUMPTRIE then we require that noper_next is - the same as scan (our current position in the regex - program). - - Once we have two or more consecutive such branches - we can create a trie of the EXACT's contents and - stitch it in place into the program. - - If the sequence represents all of the branches in - the alternation we replace the entire thing with a - single TRIE node. - - Otherwise when it is a subsequence we need to - stitch it in place and replace only the relevant - branches. This means the first branch has to remain - as it is used by the alternation logic, and its - next pointer, and needs to be repointed at the item - on the branch chain following the last branch we - have optimized away. - - This could be either a BRANCH, in which case the - subsequence is internal, or it could be the item - following the branch sequence in which case the - subsequence is at the end (which does not - necessarily mean the first node is the start of the - alternation). - - TRIE_TYPE(X) is a define which maps the optype to a - trietype. - - optype | trietype - ----------------+----------- - NOTHING | NOTHING - EXACT | EXACT - EXACT_REQ8 | EXACT - EXACTFU | EXACTFU - EXACTFU_REQ8 | EXACTFU - EXACTFUP | EXACTFU - EXACTFAA | EXACTFAA - EXACTL | EXACTL - EXACTFLU8 | EXACTFLU8 - - - */ -#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \ - ? NOTHING \ - : ( EXACT == (X) || EXACT_REQ8 == (X) ) \ - ? EXACT \ - : ( EXACTFU == (X) \ - || EXACTFU_REQ8 == (X) \ - || EXACTFUP == (X) ) \ - ? EXACTFU \ - : ( EXACTFAA == (X) ) \ - ? EXACTFAA \ - : ( EXACTL == (X) ) \ - ? EXACTL \ - : ( EXACTFLU8 == (X) ) \ - ? EXACTFLU8 \ - : 0 ) - - /* dont use tail as the end marker for this traverse */ - for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { - regnode * const noper = REGNODE_AFTER( cur ); - U8 noper_type = OP( noper ); - U8 noper_trietype = TRIE_TYPE( noper_type ); -#if defined(DEBUGGING) || defined(NOJUMPTRIE) - regnode * const noper_next = regnext( noper ); - U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; - U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0; -#endif - - DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); - Perl_re_indentf( aTHX_ "- %d:%s (%d)", - depth+1, - REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) ); - - regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state); - Perl_re_printf( aTHX_ " -> %d:%s", - REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv)); - - if ( noper_next ) { - regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state); - Perl_re_printf( aTHX_ "\t=> %d:%s\t", - REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv)); - } - Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n", - REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur), - REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype) - ); - }); - - /* Is noper a trieable nodetype that can be merged - * with the current trie (if there is one)? */ - if ( noper_trietype - && - ( - ( noper_trietype == NOTHING ) - || ( trietype == NOTHING ) - || ( trietype == noper_trietype ) - ) -#ifdef NOJUMPTRIE - && noper_next >= tail -#endif - && count < U16_MAX) - { - /* Handle mergable triable node Either we are - * the first node in a new trieable sequence, - * in which case we do some bookkeeping, - * otherwise we update the end pointer. */ - if ( !first ) { - first = cur; - if ( noper_trietype == NOTHING ) { -#if !defined(DEBUGGING) && !defined(NOJUMPTRIE) - regnode * const noper_next = regnext( noper ); - U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; - U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; -#endif - - if ( noper_next_trietype ) { - trietype = noper_next_trietype; - } else if (noper_next_type) { - /* a NOTHING regop is 1 regop wide. - * We need at least two for a trie - * so we can't merge this in */ - first = NULL; - } - } else { - trietype = noper_trietype; - } - } else { - if ( trietype == NOTHING ) - trietype = noper_trietype; - prev = cur; - } - if (first) - count++; - } /* end handle mergable triable node */ - else { - /* handle unmergable node - - * noper may either be a triable node which can - * not be tried together with the current trie, - * or a non triable node */ - if ( prev ) { - /* If last is set and trietype is not - * NOTHING then we have found at least two - * triable branch sequences in a row of a - * similar trietype so we can turn them - * into a trie. If/when we allow NOTHING to - * start a trie sequence this condition - * will be required, and it isn't expensive - * so we leave it in for now. */ - if ( trietype && trietype != NOTHING ) - make_trie( pRExC_state, - startbranch, first, cur, tail, - count, trietype, depth+1 ); - prev = NULL; /* note: we clear/update - first, trietype etc below, - so we dont do it here */ - } - if ( noper_trietype -#ifdef NOJUMPTRIE - && noper_next >= tail -#endif - ){ - /* noper is triable, so we can start a new - * trie sequence */ - count = 1; - first = cur; - trietype = noper_trietype; - } else if (first) { - /* if we already saw a first but the - * current node is not triable then we have - * to reset the first information. */ - count = 0; - first = NULL; - trietype = 0; - } - } /* end handle unmergable node */ - } /* loop over branches */ - DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); - Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ", - depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur)); - Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n", - REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur), - REGNODE_NAME(trietype) - ); - - }); - if ( prev && trietype ) { - if ( trietype != NOTHING ) { - /* the last branch of the sequence was part of - * a trie, so we have to construct it here - * outside of the loop */ - made= make_trie( pRExC_state, startbranch, - first, scan, tail, count, - trietype, depth+1 ); -#ifdef TRIE_STUDY_OPT - if ( ((made == MADE_EXACT_TRIE && - startbranch == first) - || ( first_non_open == first )) && - depth==0 ) { - flags |= SCF_TRIE_RESTUDY; - if ( startbranch == first - && scan >= tail ) - { - RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; - } - } -#endif - } else { - /* at this point we know whatever we have is a - * NOTHING sequence/branch AND if 'startbranch' - * is 'first' then we can turn the whole thing - * into a NOTHING - */ - if ( startbranch == first ) { - regnode *opt; - /* the entire thing is a NOTHING sequence, - * something like this: (?:|) So we can - * turn it into a plain NOTHING op. */ - DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); - Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n", - depth+1, - SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur)); - - }); - OP(startbranch)= NOTHING; - NEXT_OFF(startbranch)= tail - startbranch; - for ( opt= startbranch + 1; opt < tail ; opt++ ) - OP(opt)= OPTIMIZED; - } - } - } /* end if ( prev) */ - } /* TRIE_MAXBUF is non zero */ - } /* do trie */ - DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta); - } - else - scan = REGNODE_AFTER_opcode(scan,code); - continue; - } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) { - I32 paren = 0; - regnode *start = NULL; - regnode *end = NULL; - U32 my_recursed_depth= recursed_depth; - - if (OP(scan) != SUSPEND) { /* GOSUB */ - /* Do setup, note this code has side effects beyond - * the rest of this block. Specifically setting - * RExC_recurse[] must happen at least once during - * study_chunk(). */ - paren = ARG(scan); - RExC_recurse[ARG2L(scan)] = scan; - start = REGNODE_p(RExC_open_parens[paren]); - end = REGNODE_p(RExC_close_parens[paren]); - - /* NOTE we MUST always execute the above code, even - * if we do nothing with a GOSUB */ - if ( - ( flags & SCF_IN_DEFINE ) - || - ( - (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF)) - && - ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 ) - ) - ) { - /* no need to do anything here if we are in a define. */ - /* or we are after some kind of infinite construct - * so we can skip recursing into this item. - * Since it is infinite we will not change the maxlen - * or delta, and if we miss something that might raise - * the minlen it will merely pessimise a little. - * - * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/ - * might result in a minlen of 1 and not of 4, - * but this doesn't make us mismatch, just try a bit - * harder than we should. - * - * However we must assume this GOSUB is infinite, to - * avoid wrongly applying other optimizations in the - * enclosing scope - see GH 18096, for example. - */ - is_inf = is_inf_internal = 1; - scan= regnext(scan); - continue; - } - - if ( - !recursed_depth - || !PAREN_TEST(recursed_depth - 1, paren) - ) { - /* it is quite possible that there are more efficient ways - * to do this. We maintain a bitmap per level of recursion - * of which patterns we have entered so we can detect if a - * pattern creates a possible infinite loop. When we - * recurse down a level we copy the previous levels bitmap - * down. When we are at recursion level 0 we zero the top - * level bitmap. It would be nice to implement a different - * more efficient way of doing this. In particular the top - * level bitmap may be unnecessary. - */ - if (!recursed_depth) { - Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); - } else { - Copy(PAREN_OFFSET(recursed_depth - 1), - PAREN_OFFSET(recursed_depth), - RExC_study_chunk_recursed_bytes, U8); - } - /* we havent recursed into this paren yet, so recurse into it */ - DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta); - PAREN_SET(recursed_depth, paren); - my_recursed_depth= recursed_depth + 1; - } else { - DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta); - /* some form of infinite recursion, assume infinite length - * */ - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data, minlenp, is_inf); - data->cur_is_floating = 1; - } - is_inf = is_inf_internal = 1; - if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - ssc_anything(data->start_class); - flags &= ~SCF_DO_STCLASS; - - start= NULL; /* reset start so we dont recurse later on. */ - } - } else { - paren = stopparen; - start = scan + 2; - end = regnext(scan); - } - if (start) { - scan_frame *newframe; - assert(end); - if (!RExC_frame_last) { - Newxz(newframe, 1, scan_frame); - SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe); - RExC_frame_head= newframe; - RExC_frame_count++; - } else if (!RExC_frame_last->next_frame) { - Newxz(newframe, 1, scan_frame); - RExC_frame_last->next_frame= newframe; - newframe->prev_frame= RExC_frame_last; - RExC_frame_count++; - } else { - newframe= RExC_frame_last->next_frame; - } - RExC_frame_last= newframe; - - newframe->next_regnode = regnext(scan); - newframe->last_regnode = last; - newframe->stopparen = stopparen; - newframe->prev_recursed_depth = recursed_depth; - newframe->this_prev_frame= frame; - newframe->in_gosub = ( - (frame && frame->in_gosub) || OP(scan) == GOSUB - ); - - DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta); - DEBUG_PEEP("fnew", scan, depth, flags); - - frame = newframe; - scan = start; - stopparen = paren; - last = end; - depth = depth + 1; - recursed_depth= my_recursed_depth; - - continue; - } - } - else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) { - SSize_t bytelen = STR_LEN(scan), charlen; - UV uc; - assert(bytelen); - if (UTF) { - const U8 * const s = (U8*)STRING(scan); - uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); - charlen = utf8_length(s, s + bytelen); - } else { - uc = *((U8*)STRING(scan)); - charlen = bytelen; - } - min += charlen; - if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ - /* The code below prefers earlier match for fixed - offset, later match for variable offset. */ - if (data->last_end == -1) { /* Update the start info. */ - data->last_start_min = data->pos_min; - data->last_start_max = - is_inf ? OPTIMIZE_INFTY - : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min) - ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta; - } - sv_catpvn(data->last_found, STRING(scan), bytelen); - if (UTF) - SvUTF8_on(data->last_found); - { - SV * const sv = data->last_found; - MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? - mg_find(sv, PERL_MAGIC_utf8) : NULL; - if (mg && mg->mg_len >= 0) - mg->mg_len += charlen; - } - data->last_end = data->pos_min + charlen; - data->pos_min += charlen; /* As in the first entry. */ - data->flags &= ~SF_BEFORE_EOL; - } - - /* ANDing the code point leaves at most it, and not in locale, and - * can't match null string */ - if (flags & SCF_DO_STCLASS_AND) { - ssc_cp_and(data->start_class, uc); - ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; - ssc_clear_locale(data->start_class); - } - else if (flags & SCF_DO_STCLASS_OR) { - ssc_add_cp(data->start_class, uc); - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - - /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; - } - flags &= ~SCF_DO_STCLASS; - DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta); - } - else if (REGNODE_TYPE(OP(scan)) == EXACT) { - /* But OP != EXACT!, so is EXACTFish */ - SSize_t bytelen = STR_LEN(scan), charlen; - const U8 * s = (U8*)STRING(scan); - - /* Replace a length 1 ASCII fold pair node with an ANYOFM node, - * with the mask set to the complement of the bit that differs - * between upper and lower case, and the lowest code point of the - * pair (which the '&' forces) */ - if ( bytelen == 1 - && isALPHA_A(*s) - && ( OP(scan) == EXACTFAA - || ( OP(scan) == EXACTFU - && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s))) - && mutate_ok - ) { - U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */ - - OP(scan) = ANYOFM; - ARG_SET(scan, *s & mask); - FLAGS(scan) = mask; - /* We're not EXACTFish any more, so restudy. - * Search for "restudy" in this file to find - * a comment with details. */ - continue; - } - - /* Search for fixed substrings supports EXACT only. */ - if (flags & SCF_DO_SUBSTR) { - assert(data); - scan_commit(pRExC_state, data, minlenp, is_inf); - } - charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen; - if (unfolded_multi_char) { - RExC_seen |= REG_UNFOLDED_MULTI_SEEN; - } - min += charlen - min_subtract; - assert (min >= 0); - if ((SSize_t)min_subtract < OPTIMIZE_INFTY - && delta < OPTIMIZE_INFTY - (SSize_t)min_subtract - ) { - delta += min_subtract; - } else { - delta = OPTIMIZE_INFTY; - } - if (flags & SCF_DO_SUBSTR) { - data->pos_min += charlen - min_subtract; - if (data->pos_min < 0) { - data->pos_min = 0; - } - if ((SSize_t)min_subtract < OPTIMIZE_INFTY - && data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract - ) { - data->pos_delta += min_subtract; - } else { - data->pos_delta = OPTIMIZE_INFTY; - } - if (min_subtract) { - data->cur_is_floating = 1; /* float */ - } - } - - if (flags & SCF_DO_STCLASS) { - SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan); - - assert(EXACTF_invlist); - if (flags & SCF_DO_STCLASS_AND) { - if (OP(scan) != EXACTFL) - ssc_clear_locale(data->start_class); - ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; - ANYOF_POSIXL_ZERO(data->start_class); - ssc_intersection(data->start_class, EXACTF_invlist, FALSE); - } - else { /* SCF_DO_STCLASS_OR */ - ssc_union(data->start_class, EXACTF_invlist, FALSE); - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - - /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; - } - flags &= ~SCF_DO_STCLASS; - SvREFCNT_dec(EXACTF_invlist); - } - DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta); - } - else if (REGNODE_VARIES(OP(scan))) { - SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; - I32 fl = 0; - U32 f = flags; - regnode * const oscan = scan; - regnode_ssc this_class; - regnode_ssc *oclass = NULL; - I32 next_is_eval = 0; - - switch (REGNODE_TYPE(OP(scan))) { - case WHILEM: /* End of (?:...)* . */ - scan = REGNODE_AFTER(scan); - goto finish; - case PLUS: - if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { - next = REGNODE_AFTER(scan); - if ( ( REGNODE_TYPE(OP(next)) == EXACT - && ! isEXACTFish(OP(next))) - || (flags & SCF_DO_STCLASS)) - { - mincount = 1; - maxcount = REG_INFTY; - next = regnext(scan); - scan = REGNODE_AFTER(scan); - goto do_curly; - } - } - if (flags & SCF_DO_SUBSTR) - data->pos_min++; - /* This will bypass the formal 'min += minnext * mincount' - * calculation in the do_curly path, so assumes min width - * of the PLUS payload is exactly one. */ - min++; - /* FALLTHROUGH */ - case STAR: - next = REGNODE_AFTER(scan); - - /* This temporary node can now be turned into EXACTFU, and - * must, as regexec.c doesn't handle it */ - if (OP(next) == EXACTFU_S_EDGE && mutate_ok) { - OP(next) = EXACTFU; - } - - if ( STR_LEN(next) == 1 - && isALPHA_A(* STRING(next)) - && ( OP(next) == EXACTFAA - || ( OP(next) == EXACTFU - && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))) - && mutate_ok - ) { - /* These differ in just one bit */ - U8 mask = ~ ('A' ^ 'a'); - - assert(isALPHA_A(* STRING(next))); - - /* Then replace it by an ANYOFM node, with - * the mask set to the complement of the - * bit that differs between upper and lower - * case, and the lowest code point of the - * pair (which the '&' forces) */ - OP(next) = ANYOFM; - ARG_SET(next, *STRING(next) & mask); - FLAGS(next) = mask; - } - - if (flags & SCF_DO_STCLASS) { - mincount = 0; - maxcount = REG_INFTY; - next = regnext(scan); - scan = REGNODE_AFTER(scan); - goto do_curly; - } - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data, minlenp, is_inf); - /* Cannot extend fixed substrings */ - data->cur_is_floating = 1; /* float */ - } - is_inf = is_inf_internal = 1; - scan = regnext(scan); - goto optimize_curly_tail; - case CURLY: - if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) - && (scan->flags == stopparen)) - { - mincount = 1; - maxcount = 1; - } else { - mincount = ARG1(scan); - maxcount = ARG2(scan); - } - next = regnext(scan); - if (OP(scan) == CURLYX) { - I32 lp = (data ? *(data->last_closep) : 0); - scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); - } - scan = REGNODE_AFTER(scan); - next_is_eval = (OP(scan) == EVAL); - do_curly: - if (flags & SCF_DO_SUBSTR) { - if (mincount == 0) - scan_commit(pRExC_state, data, minlenp, is_inf); - /* Cannot extend fixed substrings */ - pos_before = data->pos_min; - } - if (data) { - fl = data->flags; - data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); - if (is_inf) - data->flags |= SF_IS_INF; - } - if (flags & SCF_DO_STCLASS) { - ssc_init(pRExC_state, &this_class); - oclass = data->start_class; - data->start_class = &this_class; - f |= SCF_DO_STCLASS_AND; - f &= ~SCF_DO_STCLASS_OR; - } - /* Exclude from super-linear cache processing any {n,m} - regops for which the combination of input pos and regex - pos is not enough information to determine if a match - will be possible. - - For example, in the regex /foo(bar\s*){4,8}baz/ with the - regex pos at the \s*, the prospects for a match depend not - only on the input position but also on how many (bar\s*) - repeats into the {4,8} we are. */ - if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY)) - f &= ~SCF_WHILEM_VISITED_POS; - - /* This will finish on WHILEM, setting scan, or on NULL: */ - /* recurse study_chunk() on loop bodies */ - minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - last, data, stopparen, recursed_depth, NULL, - (mincount == 0 - ? (f & ~SCF_DO_SUBSTR) - : f) - , depth+1, mutate_ok); - - if (data && data->flags & SCF_SEEN_ACCEPT) { - if (mincount > 1) - mincount = 1; - } - - if (flags & SCF_DO_STCLASS) - data->start_class = oclass; - if (mincount == 0 || minnext == 0) { - if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); - } - else if (flags & SCF_DO_STCLASS_AND) { - /* Switch to OR mode: cache the old value of - * data->start_class */ - INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, regnode_ssc); - flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&this_class, data->start_class, regnode_ssc); - flags |= SCF_DO_STCLASS_OR; - ANYOF_FLAGS(data->start_class) - |= SSC_MATCHES_EMPTY_STRING; - } - } else { /* Non-zero len */ - if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - } - else if (flags & SCF_DO_STCLASS_AND) - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); - flags &= ~SCF_DO_STCLASS; - } - if (!scan) /* It was not CURLYX, but CURLY. */ - scan = next; - if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR) - /* ? quantifier ok, except for (?{ ... }) */ - && (next_is_eval || !(mincount == 0 && maxcount == 1)) - && (minnext == 0) && (deltanext == 0) - && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) - && maxcount <= REG_INFTY/3) /* Complement check for big - count */ - { - _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP), - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), - "Quantifier unexpected on zero-length expression " - "in regex m/%" UTF8f "/", - UTF8fARG(UTF, RExC_precomp_end - RExC_precomp, - RExC_precomp))); - } - - if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext ) - || min >= SSize_t_MAX - minnext * mincount ) - { - FAIL("Regexp out of space"); - } - - min += minnext * mincount; - is_inf_internal |= deltanext == OPTIMIZE_INFTY - || (maxcount == REG_INFTY && minnext + deltanext > 0); - is_inf |= is_inf_internal; - if (is_inf) { - delta = OPTIMIZE_INFTY; - } else { - delta += (minnext + deltanext) * maxcount - - minnext * mincount; - } - - if (data && data->flags & SCF_SEEN_ACCEPT) { - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data, minlenp, is_inf); - flags &= ~SCF_DO_SUBSTR; - } - if (stopmin > min) - stopmin = min; - DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta); - } - /* Try powerful optimization CURLYX => CURLYN. */ - if ( OP(oscan) == CURLYX && data - && data->flags & SF_IN_PAR - && !(data->flags & SF_HAS_EVAL) - && !deltanext && minnext == 1 - && mutate_ok - ) { - /* Try to optimize to CURLYN. */ - regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); - regnode * const nxt1 = nxt; -#ifdef DEBUGGING - regnode *nxt2; -#endif - - /* Skip open. */ - nxt = regnext(nxt); - if (!REGNODE_SIMPLE(OP(nxt)) - && !(REGNODE_TYPE(OP(nxt)) == EXACT - && STR_LEN(nxt) == 1)) - goto nogo; -#ifdef DEBUGGING - nxt2 = nxt; -#endif - nxt = regnext(nxt); - if (OP(nxt) != CLOSE) - goto nogo; - if (RExC_open_parens) { - - /*open->CURLYM*/ - RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan); - - /*close->while*/ - RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt) + 2; - } - /* Now we know that nxt2 is the only contents: */ - oscan->flags = (U8)PARNO(nxt); - OP(oscan) = CURLYN; - OP(nxt1) = NOTHING; /* was OPEN. */ - -#ifdef DEBUGGING - OP(nxt1 + 1) = OPTIMIZED; /* was count. */ - NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ - NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ - OP(nxt) = OPTIMIZED; /* was CLOSE. */ - OP(nxt + 1) = OPTIMIZED; /* was count. */ - NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ -#endif - } - nogo: - - /* Try optimization CURLYX => CURLYM. */ - if ( OP(oscan) == CURLYX && data - && !(data->flags & SF_HAS_PAR) - && !(data->flags & SF_HAS_EVAL) - && !deltanext /* atom is fixed width */ - && minnext != 0 /* CURLYM can't handle zero width */ - /* Nor characters whose fold at run-time may be - * multi-character */ - && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) - && mutate_ok - ) { - /* XXXX How to optimize if data == 0? */ - /* Optimize to a simpler form. */ - regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); /* OPEN */ - regnode *nxt2; - - OP(oscan) = CURLYM; - while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ - && (OP(nxt2) != WHILEM)) - nxt = nxt2; - OP(nxt2) = SUCCEED; /* Whas WHILEM */ - /* Need to optimize away parenths. */ - if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { - /* Set the parenth number. */ - /* note that we have changed the type of oscan to CURLYM here */ - regnode *nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); /* OPEN*/ - - oscan->flags = (U8)PARNO(nxt); - if (RExC_open_parens) { - /*open->CURLYM*/ - RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan); - - /*close->NOTHING*/ - RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt2) - + 1; - } - OP(nxt1) = OPTIMIZED; /* was OPEN. */ - OP(nxt) = OPTIMIZED; /* was CLOSE. */ - -#ifdef DEBUGGING - OP(nxt1 + 1) = OPTIMIZED; /* was count. */ - OP(nxt + 1) = OPTIMIZED; /* was count. */ - NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */ - NEXT_OFF(nxt + 1) = 0; /* just for consistency. */ -#endif -#if 0 - while ( nxt1 && (OP(nxt1) != WHILEM)) { - regnode *nnxt = regnext(nxt1); - if (nnxt == nxt) { - if (REGNODE_OFF_BY_ARG(OP(nxt1))) - ARG_SET(nxt1, nxt2 - nxt1); - else if (nxt2 - nxt1 < U16_MAX) - NEXT_OFF(nxt1) = nxt2 - nxt1; - else - OP(nxt) = NOTHING; /* Cannot beautify */ - } - nxt1 = nnxt; - } -#endif - /* Optimize again: */ - /* recurse study_chunk() on optimised CURLYX => CURLYM */ - study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, - NULL, stopparen, recursed_depth, NULL, 0, - depth+1, mutate_ok); - } - else - oscan->flags = 0; - } - else if ((OP(oscan) == CURLYX) - && (flags & SCF_WHILEM_VISITED_POS) - /* See the comment on a similar expression above. - However, this time it's not a subexpression - we care about, but the expression itself. */ - && (maxcount == REG_INFTY) - && data) { - /* This stays as CURLYX, we can put the count/of pair. */ - /* Find WHILEM (as in regexec.c) */ - regnode *nxt = oscan + NEXT_OFF(oscan); - - if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */ - nxt += ARG(nxt); - nxt = REGNODE_BEFORE(nxt); - if (nxt->flags & 0xf) { - /* we've already set whilem count on this node */ - } else if (++data->whilem_c < 16) { - assert(data->whilem_c <= RExC_whilem_seen); - nxt->flags = (U8)(data->whilem_c - | (RExC_whilem_seen << 4)); /* On WHILEM */ - } - } - if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) - pars++; - if (flags & SCF_DO_SUBSTR) { - SV *last_str = NULL; - STRLEN last_chrs = 0; - int counted = mincount != 0; - - if (data->last_end > 0 && mincount != 0) { /* Ends with a - string. */ - SSize_t b = pos_before >= data->last_start_min - ? pos_before : data->last_start_min; - STRLEN l; - const char * const s = SvPV_const(data->last_found, l); - SSize_t old = b - data->last_start_min; - assert(old >= 0); - - if (UTF) - old = utf8_hop_forward((U8*)s, old, - (U8 *) SvEND(data->last_found)) - - (U8*)s; - l -= old; - /* Get the added string: */ - last_str = newSVpvn_utf8(s + old, l, UTF); - last_chrs = UTF ? utf8_length((U8*)(s + old), - (U8*)(s + old + l)) : l; - if (deltanext == 0 && pos_before == b) { - /* What was added is a constant string */ - if (mincount > 1) { - - SvGROW(last_str, (mincount * l) + 1); - repeatcpy(SvPVX(last_str) + l, - SvPVX_const(last_str), l, - mincount - 1); - SvCUR_set(last_str, SvCUR(last_str) * mincount); - /* Add additional parts. */ - SvCUR_set(data->last_found, - SvCUR(data->last_found) - l); - sv_catsv(data->last_found, last_str); - { - SV * sv = data->last_found; - MAGIC *mg = - SvUTF8(sv) && SvMAGICAL(sv) ? - mg_find(sv, PERL_MAGIC_utf8) : NULL; - if (mg && mg->mg_len >= 0) - mg->mg_len += last_chrs * (mincount-1); - } - last_chrs *= mincount; - data->last_end += l * (mincount - 1); - } - } else { - /* start offset must point into the last copy */ - data->last_start_min += minnext * (mincount - 1); - data->last_start_max = - is_inf - ? OPTIMIZE_INFTY - : data->last_start_max + - (maxcount - 1) * (minnext + data->pos_delta); - } - } - /* It is counted once already... */ - data->pos_min += minnext * (mincount - counted); -#if 0 - Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf - " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf - " maxcount=%" UVuf " mincount=%" UVuf - " data->pos_delta=%" UVuf "\n", - (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, - (UV)maxcount, (UV)mincount, (UV)data->pos_delta); - if (deltanext != OPTIMIZE_INFTY) - Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", - (UV)(-counted * deltanext + (minnext + deltanext) * maxcount - - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta)); -#endif - if (deltanext == OPTIMIZE_INFTY - || data->pos_delta == OPTIMIZE_INFTY - || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta) - data->pos_delta = OPTIMIZE_INFTY; - else - data->pos_delta += - counted * deltanext + - (minnext + deltanext) * maxcount - minnext * mincount; - if (mincount != maxcount) { - /* Cannot extend fixed substrings found inside - the group. */ - scan_commit(pRExC_state, data, minlenp, is_inf); - if (mincount && last_str) { - SV * const sv = data->last_found; - MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? - mg_find(sv, PERL_MAGIC_utf8) : NULL; - - if (mg) - mg->mg_len = -1; - sv_setsv(sv, last_str); - data->last_end = data->pos_min; - data->last_start_min = data->pos_min - last_chrs; - data->last_start_max = is_inf - ? OPTIMIZE_INFTY - : data->pos_min + data->pos_delta - last_chrs; - } - data->cur_is_floating = 1; /* float */ - } - SvREFCNT_dec(last_str); - } - if (data && (fl & SF_HAS_EVAL)) - data->flags |= SF_HAS_EVAL; - optimize_curly_tail: - rck_elide_nothing(oscan); - continue; - - default: - Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", - OP(scan)); - case REF: - case CLUMP: - if (flags & SCF_DO_SUBSTR) { - /* Cannot expect anything... */ - scan_commit(pRExC_state, data, minlenp, is_inf); - data->cur_is_floating = 1; /* float */ - } - is_inf = is_inf_internal = 1; - if (flags & SCF_DO_STCLASS_OR) { - if (OP(scan) == CLUMP) { - /* Actually is any start char, but very few code points - * aren't start characters */ - ssc_match_all_cp(data->start_class); - } - else { - ssc_anything(data->start_class); - } - } - flags &= ~SCF_DO_STCLASS; - break; - } - } - else if (OP(scan) == LNBREAK) { - if (flags & SCF_DO_STCLASS) { - if (flags & SCF_DO_STCLASS_AND) { - ssc_intersection(data->start_class, - PL_XPosix_ptrs[CC_VERTSPACE_], FALSE); - ssc_clear_locale(data->start_class); - ANYOF_FLAGS(data->start_class) - &= ~SSC_MATCHES_EMPTY_STRING; - } - else if (flags & SCF_DO_STCLASS_OR) { - ssc_union(data->start_class, - PL_XPosix_ptrs[CC_VERTSPACE_], - FALSE); - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - - /* See commit msg for - * 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) - &= ~SSC_MATCHES_EMPTY_STRING; - } - flags &= ~SCF_DO_STCLASS; - } - min++; - if (delta != OPTIMIZE_INFTY) - delta++; /* Because of the 2 char string cr-lf */ - if (flags & SCF_DO_SUBSTR) { - /* Cannot expect anything... */ - scan_commit(pRExC_state, data, minlenp, is_inf); - data->pos_min += 1; - if (data->pos_delta != OPTIMIZE_INFTY) { - data->pos_delta += 1; - } - data->cur_is_floating = 1; /* float */ - } - } - else if (REGNODE_SIMPLE(OP(scan))) { - - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data, minlenp, is_inf); - data->pos_min++; - } - min++; - if (flags & SCF_DO_STCLASS) { - bool invert = 0; - SV* my_invlist = NULL; - U8 namedclass; - - /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; - - /* Some of the logic below assumes that switching - locale on will only add false positives. */ - switch (OP(scan)) { - - default: -#ifdef DEBUGGING - Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", - OP(scan)); -#endif - case SANY: - if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - ssc_match_all_cp(data->start_class); - break; - - case REG_ANY: - { - SV* REG_ANY_invlist = _new_invlist(2); - REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, - '\n'); - if (flags & SCF_DO_STCLASS_OR) { - ssc_union(data->start_class, - REG_ANY_invlist, - TRUE /* TRUE => invert, hence all but \n - */ - ); - } - else if (flags & SCF_DO_STCLASS_AND) { - ssc_intersection(data->start_class, - REG_ANY_invlist, - TRUE /* TRUE => invert */ - ); - ssc_clear_locale(data->start_class); - } - SvREFCNT_dec_NN(REG_ANY_invlist); - } - break; - - case ANYOFD: - case ANYOFL: - case ANYOFPOSIXL: - case ANYOFH: - case ANYOFHb: - case ANYOFHr: - case ANYOFHs: - case ANYOF: - if (flags & SCF_DO_STCLASS_AND) - ssc_and(pRExC_state, data->start_class, - (regnode_charclass *) scan); - else - ssc_or(pRExC_state, data->start_class, - (regnode_charclass *) scan); - break; - - case ANYOFHbbm: - { - SV* cp_list = get_ANYOFHbbm_contents(scan); - - if (flags & SCF_DO_STCLASS_OR) { - ssc_union(data->start_class, cp_list, invert); - } - else if (flags & SCF_DO_STCLASS_AND) { - ssc_intersection(data->start_class, cp_list, invert); - } - - SvREFCNT_dec_NN(cp_list); - break; - } - - case NANYOFM: /* NANYOFM already contains the inversion of the - input ANYOF data, so, unlike things like - NPOSIXA, don't change 'invert' to TRUE */ - /* FALLTHROUGH */ - case ANYOFM: - { - SV* cp_list = get_ANYOFM_contents(scan); - - if (flags & SCF_DO_STCLASS_OR) { - ssc_union(data->start_class, cp_list, invert); - } - else if (flags & SCF_DO_STCLASS_AND) { - ssc_intersection(data->start_class, cp_list, invert); - } - - SvREFCNT_dec_NN(cp_list); - break; - } - - case ANYOFR: - case ANYOFRb: - { - SV* cp_list = NULL; - - cp_list = _add_range_to_invlist(cp_list, - ANYOFRbase(scan), - ANYOFRbase(scan) + ANYOFRdelta(scan)); - - if (flags & SCF_DO_STCLASS_OR) { - ssc_union(data->start_class, cp_list, invert); - } - else if (flags & SCF_DO_STCLASS_AND) { - ssc_intersection(data->start_class, cp_list, invert); - } - - SvREFCNT_dec_NN(cp_list); - break; - } - - case NPOSIXL: - invert = 1; - /* FALLTHROUGH */ - - case POSIXL: - namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; - if (flags & SCF_DO_STCLASS_AND) { - bool was_there = cBOOL( - ANYOF_POSIXL_TEST(data->start_class, - namedclass)); - ANYOF_POSIXL_ZERO(data->start_class); - if (was_there) { /* Do an AND */ - ANYOF_POSIXL_SET(data->start_class, namedclass); - } - /* No individual code points can now match */ - data->start_class->invlist - = sv_2mortal(_new_invlist(0)); - } - else { - int complement = namedclass + ((invert) ? -1 : 1); - - assert(flags & SCF_DO_STCLASS_OR); - - /* If the complement of this class was already there, - * the result is that they match all code points, - * (\d + \D == everything). Remove the classes from - * future consideration. Locale is not relevant in - * this case */ - if (ANYOF_POSIXL_TEST(data->start_class, complement)) { - ssc_match_all_cp(data->start_class); - ANYOF_POSIXL_CLEAR(data->start_class, namedclass); - ANYOF_POSIXL_CLEAR(data->start_class, complement); - } - else { /* The usual case; just add this class to the - existing set */ - ANYOF_POSIXL_SET(data->start_class, namedclass); - } - } - break; - - case NPOSIXA: /* For these, we always know the exact set of - what's matched */ - invert = 1; - /* FALLTHROUGH */ - case POSIXA: - my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL); - goto join_posix_and_ascii; - - case NPOSIXD: - case NPOSIXU: - invert = 1; - /* FALLTHROUGH */ - case POSIXD: - case POSIXU: - my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL); - - /* NPOSIXD matches all upper Latin1 code points unless the - * target string being matched is UTF-8, which is - * unknowable until match time. Since we are going to - * invert, we want to get rid of all of them so that the - * inversion will match all */ - if (OP(scan) == NPOSIXD) { - _invlist_subtract(my_invlist, PL_UpperLatin1, - &my_invlist); - } - - join_posix_and_ascii: - - if (flags & SCF_DO_STCLASS_AND) { - ssc_intersection(data->start_class, my_invlist, invert); - ssc_clear_locale(data->start_class); - } - else { - assert(flags & SCF_DO_STCLASS_OR); - ssc_union(data->start_class, my_invlist, invert); - } - SvREFCNT_dec(my_invlist); - } - if (flags & SCF_DO_STCLASS_OR) - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - flags &= ~SCF_DO_STCLASS; - } - } - else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) { - data->flags |= (OP(scan) == MEOL - ? SF_BEFORE_MEOL - : SF_BEFORE_SEOL); - scan_commit(pRExC_state, data, minlenp, is_inf); - - } - else if ( REGNODE_TYPE(OP(scan)) == BRANCHJ - /* Lookbehind, or need to calculate parens/evals/stclass: */ - && (scan->flags || data || (flags & SCF_DO_STCLASS)) - && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) - { - if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY - || OP(scan) == UNLESSM ) - { - /* Negative Lookahead/lookbehind - In this case we can't do fixed string optimisation. - */ - - bool is_positive = OP(scan) == IFMATCH ? 1 : 0; - SSize_t deltanext, minnext; - SSize_t fake_last_close = 0; - regnode *fake_last_close_op = NULL; - regnode *cur_last_close_op; - regnode *nscan; - regnode_ssc intrnl; - U32 f = (flags & SCF_TRIE_DOING_RESTUDY); - - StructCopy(&zero_scan_data, &data_fake, scan_data_t); - if (data) { - data_fake.whilem_c = data->whilem_c; - data_fake.last_closep = data->last_closep; - data_fake.last_close_opp = data->last_close_opp; - } - else { - data_fake.last_closep = &fake_last_close; - data_fake.last_close_opp = &fake_last_close_op; - } - - /* remember the last_close_op we saw so we can see if - * we are dealing with variable length lookbehind that - * contains capturing buffers, which are considered - * experimental */ - cur_last_close_op= *(data_fake.last_close_opp); - - data_fake.pos_delta = delta; - if ( flags & SCF_DO_STCLASS && !scan->flags - && OP(scan) == IFMATCH ) { /* Lookahead */ - ssc_init(pRExC_state, &intrnl); - data_fake.start_class = &intrnl; - f |= SCF_DO_STCLASS_AND; - } - if (flags & SCF_WHILEM_VISITED_POS) - f |= SCF_WHILEM_VISITED_POS; - next = regnext(scan); - nscan = REGNODE_AFTER(scan); - - /* recurse study_chunk() for lookahead body */ - minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, - last, &data_fake, stopparen, - recursed_depth, NULL, f, depth+1, - mutate_ok); - - if (scan->flags) { - if ( deltanext < 0 - || deltanext > (I32) U8_MAX - || minnext > (I32)U8_MAX - || minnext + deltanext > (I32)U8_MAX) - { - FAIL2("Lookbehind longer than %" UVuf " not implemented", - (UV)U8_MAX); - } - - /* The 'next_off' field has been repurposed to count the - * additional starting positions to try beyond the initial - * one. (This leaves it at 0 for non-variable length - * matches to avoid breakage for those not using this - * extension) */ - if (deltanext) { - scan->next_off = deltanext; - if ( - /* See a CLOSE op inside this lookbehind? */ - cur_last_close_op != *(data_fake.last_close_opp) - /* and not doing restudy. see: restudied */ - && !(flags & SCF_TRIE_DOING_RESTUDY) - ) { - /* this is positive variable length lookbehind with - * capture buffers inside of it */ - ckWARNexperimental_with_arg(RExC_parse, - WARN_EXPERIMENTAL__VLB, - "Variable length %s lookbehind with capturing is experimental", - is_positive ? "positive" : "negative"); - } - } - scan->flags = (U8)minnext + deltanext; - } - if (data) { - if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) - pars++; - if (data_fake.flags & SF_HAS_EVAL) - data->flags |= SF_HAS_EVAL; - data->whilem_c = data_fake.whilem_c; - } - if (f & SCF_DO_STCLASS_AND) { - if (flags & SCF_DO_STCLASS_OR) { - /* OR before, AND after: ideally we would recurse with - * data_fake to get the AND applied by study of the - * remainder of the pattern, and then derecurse; - * *** HACK *** for now just treat as "no information". - * See [perl #56690]. - */ - ssc_init(pRExC_state, data->start_class); - } else { - /* AND before and after: combine and continue. These - * assertions are zero-length, so can match an EMPTY - * string */ - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); - ANYOF_FLAGS(data->start_class) - |= SSC_MATCHES_EMPTY_STRING; - } - } - DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta); - } -#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY - else { - /* Positive Lookahead/lookbehind - In this case we can do fixed string optimisation, - but we must be careful about it. Note in the case of - lookbehind the positions will be offset by the minimum - length of the pattern, something we won't know about - until after the recurse. - */ - SSize_t deltanext, fake_last_close = 0; - regnode *last_close_op = NULL; - regnode *nscan; - regnode_ssc intrnl; - U32 f = (flags & SCF_TRIE_DOING_RESTUDY); - /* We use SAVEFREEPV so that when the full compile - is finished perl will clean up the allocated - minlens when it's all done. This way we don't - have to worry about freeing them when we know - they wont be used, which would be a pain. - */ - SSize_t *minnextp; - Newx( minnextp, 1, SSize_t ); - SAVEFREEPV(minnextp); - - if (data) { - StructCopy(data, &data_fake, scan_data_t); - if ((flags & SCF_DO_SUBSTR) && data->last_found) { - f |= SCF_DO_SUBSTR; - if (scan->flags) - scan_commit(pRExC_state, &data_fake, minlenp, is_inf); - data_fake.last_found=newSVsv(data->last_found); - } - } - else { - data_fake.last_closep = &fake_last_close; - data_fake.last_close_opp = &fake_last_close_opp; - } - data_fake.flags = 0; - data_fake.substrs[0].flags = 0; - data_fake.substrs[1].flags = 0; - data_fake.pos_delta = delta; - if (is_inf) - data_fake.flags |= SF_IS_INF; - if ( flags & SCF_DO_STCLASS && !scan->flags - && OP(scan) == IFMATCH ) { /* Lookahead */ - ssc_init(pRExC_state, &intrnl); - data_fake.start_class = &intrnl; - f |= SCF_DO_STCLASS_AND; - } - if (flags & SCF_WHILEM_VISITED_POS) - f |= SCF_WHILEM_VISITED_POS; - next = regnext(scan); - nscan = REGNODE_AFTER(scan); - - /* positive lookahead study_chunk() recursion */ - *minnextp = study_chunk(pRExC_state, &nscan, minnextp, - &deltanext, last, &data_fake, - stopparen, recursed_depth, NULL, - f, depth+1, mutate_ok); - if (scan->flags) { - assert(0); /* This code has never been tested since this - is normally not compiled */ - if ( deltanext < 0 - || deltanext > (I32) U8_MAX - || *minnextp > (I32)U8_MAX - || *minnextp + deltanext > (I32)U8_MAX) - { - FAIL2("Lookbehind longer than %" UVuf " not implemented", - (UV)U8_MAX); - } - - if (deltanext) { - scan->next_off = deltanext; - } - scan->flags = (U8)*minnextp + deltanext; - } - - *minnextp += min; - - if (f & SCF_DO_STCLASS_AND) { - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); - ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING; - } - if (data) { - if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) - pars++; - if (data_fake.flags & SF_HAS_EVAL) - data->flags |= SF_HAS_EVAL; - data->whilem_c = data_fake.whilem_c; - if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { - int i; - if (RExC_rx->minlen < *minnextp) - RExC_rx->minlen = *minnextp; - scan_commit(pRExC_state, &data_fake, minnextp, is_inf); - SvREFCNT_dec_NN(data_fake.last_found); - - for (i = 0; i < 2; i++) { - if (data_fake.substrs[i].minlenp != minlenp) { - data->substrs[i].min_offset = - data_fake.substrs[i].min_offset; - data->substrs[i].max_offset = - data_fake.substrs[i].max_offset; - data->substrs[i].minlenp = - data_fake.substrs[i].minlenp; - data->substrs[i].lookbehind += scan->flags; - } - } - } - } - } -#endif - } - else if (OP(scan) == OPEN) { - if (stopparen != (I32)PARNO(scan)) - pars++; - } - else if (OP(scan) == CLOSE) { - if (stopparen == (I32)PARNO(scan)) { - break; - } - if ((I32)PARNO(scan) == is_par) { - next = regnext(scan); - - if ( next && (OP(next) != WHILEM) && next < last) - is_par = 0; /* Disable optimization */ - } - if (data) { - *(data->last_closep) = PARNO(scan); - *(data->last_close_opp) = scan; - } - } - else if (OP(scan) == EVAL) { - if (data) - data->flags |= SF_HAS_EVAL; - } - else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) { - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data, minlenp, is_inf); - flags &= ~SCF_DO_SUBSTR; - } - if (OP(scan)==ACCEPT) { - /* m{(*ACCEPT)x} does not have to start with 'x' */ - flags &= ~SCF_DO_STCLASS; - if (data) - data->flags |= SCF_SEEN_ACCEPT; - if (stopmin > min) - stopmin = min; - } - } - else if (OP(scan) == COMMIT) { - /* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we - * must not end up with "abcxyz" as a fixed substring else we'll - * skip straight to attempting to match at offset 4. - */ - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data, minlenp, is_inf); - flags &= ~SCF_DO_SUBSTR; - } - } - else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ - { - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data, minlenp, is_inf); - data->cur_is_floating = 1; /* float */ - } - is_inf = is_inf_internal = 1; - if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - ssc_anything(data->start_class); - flags &= ~SCF_DO_STCLASS; - } - else if (OP(scan) == GPOS) { - if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && - !(delta || is_inf || (data && data->pos_delta))) - { - if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) - RExC_rx->intflags |= PREGf_ANCH_GPOS; - if (RExC_rx->gofs < (STRLEN)min) - RExC_rx->gofs = min; - } else { - RExC_rx->intflags |= PREGf_GPOS_FLOAT; - RExC_rx->gofs = 0; - } - } -#ifdef TRIE_STUDY_OPT -#ifdef FULL_TRIE_STUDY - else if (REGNODE_TYPE(OP(scan)) == TRIE) { - /* NOTE - There is similar code to this block above for handling - BRANCH nodes on the initial study. If you change stuff here - check there too. */ - regnode *trie_node= scan; - regnode *tail= regnext(scan); - reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; - SSize_t max1 = 0, min1 = OPTIMIZE_INFTY; - regnode_ssc accum; - - if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ - /* Cannot merge strings after this. */ - scan_commit(pRExC_state, data, minlenp, is_inf); - } - if (flags & SCF_DO_STCLASS) - ssc_init_zero(pRExC_state, &accum); - - if (!trie->jump) { - min1= trie->minlen; - max1= trie->maxlen; - } else { - const regnode *nextbranch= NULL; - U32 word; - - for ( word=1 ; word <= trie->wordcount ; word++) - { - SSize_t deltanext = 0, minnext = 0; - U32 f = (flags & SCF_TRIE_DOING_RESTUDY); - SSize_t fake_last_close = 0; - regnode *fake_last_close_op = NULL; - regnode_ssc this_class; - - StructCopy(&zero_scan_data, &data_fake, scan_data_t); - if (data) { - data_fake.whilem_c = data->whilem_c; - data_fake.last_closep = data->last_closep; - data_fake.last_close_opp = data->last_close_opp; - } - else { - data_fake.last_closep = &fake_last_close; - data_fake.last_close_opp = &fake_last_close_op; - } - data_fake.pos_delta = delta; - if (flags & SCF_DO_STCLASS) { - ssc_init(pRExC_state, &this_class); - data_fake.start_class = &this_class; - f |= SCF_DO_STCLASS_AND; - } - if (flags & SCF_WHILEM_VISITED_POS) - f |= SCF_WHILEM_VISITED_POS; - - if (trie->jump[word]) { - if (!nextbranch) - nextbranch = trie_node + trie->jump[0]; - scan= trie_node + trie->jump[word]; - /* We go from the jump point to the branch that follows - it. Note this means we need the vestigal unused - branches even though they arent otherwise used. */ - /* optimise study_chunk() for TRIE */ - minnext = study_chunk(pRExC_state, &scan, minlenp, - &deltanext, (regnode *)nextbranch, &data_fake, - stopparen, recursed_depth, NULL, f, depth+1, - mutate_ok); - } - if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH) - nextbranch= regnext((regnode*)nextbranch); - - if (min1 > (SSize_t)(minnext + trie->minlen)) - min1 = minnext + trie->minlen; - if (deltanext == OPTIMIZE_INFTY) { - is_inf = is_inf_internal = 1; - max1 = OPTIMIZE_INFTY; - } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) - max1 = minnext + deltanext + trie->maxlen; - - if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) - pars++; - if (data_fake.flags & SCF_SEEN_ACCEPT) { - if ( stopmin > min + min1) - stopmin = min + min1; - flags &= ~SCF_DO_SUBSTR; - if (data) - data->flags |= SCF_SEEN_ACCEPT; - } - if (data) { - if (data_fake.flags & SF_HAS_EVAL) - data->flags |= SF_HAS_EVAL; - data->whilem_c = data_fake.whilem_c; - } - if (flags & SCF_DO_STCLASS) - ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); - } - DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta); - } - if (flags & SCF_DO_SUBSTR) { - data->pos_min += min1; - data->pos_delta += max1 - min1; - if (max1 != min1 || is_inf) - data->cur_is_floating = 1; /* float */ - } - min += min1; - if (delta != OPTIMIZE_INFTY) { - if (OPTIMIZE_INFTY - (max1 - min1) >= delta) - delta += max1 - min1; - else - delta = OPTIMIZE_INFTY; - } - if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); - if (min1) { - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - flags &= ~SCF_DO_STCLASS; - } - } - else if (flags & SCF_DO_STCLASS_AND) { - if (min1) { - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); - flags &= ~SCF_DO_STCLASS; - } - else { - /* Switch to OR mode: cache the old value of - * data->start_class */ - INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, regnode_ssc); - flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, regnode_ssc); - flags |= SCF_DO_STCLASS_OR; - } - } - scan= tail; - DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta); - continue; - } -#else - else if (REGNODE_TYPE(OP(scan)) == TRIE) { - reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; - U8*bang=NULL; - - min += trie->minlen; - delta += (trie->maxlen - trie->minlen); - flags &= ~SCF_DO_STCLASS; /* xxx */ - if (flags & SCF_DO_SUBSTR) { - /* Cannot expect anything... */ - scan_commit(pRExC_state, data, minlenp, is_inf); - data->pos_min += trie->minlen; - data->pos_delta += (trie->maxlen - trie->minlen); - if (trie->maxlen != trie->minlen) - data->cur_is_floating = 1; /* float */ - } - if (trie->jump) /* no more substrings -- for now /grr*/ - flags &= ~SCF_DO_SUBSTR; - } - -#endif /* old or new */ -#endif /* TRIE_STUDY_OPT */ - - else if (OP(scan) == REGEX_SET) { - Perl_croak(aTHX_ "panic: %s regnode should be resolved" - " before optimization", REGNODE_NAME(REGEX_SET)); - } - - /* Else: zero-length, ignore. */ - scan = regnext(scan); - } - - finish: - if (frame) { - /* we need to unwind recursion. */ - depth = depth - 1; - - DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta); - DEBUG_PEEP("fend", scan, depth, flags); - - /* restore previous context */ - last = frame->last_regnode; - scan = frame->next_regnode; - stopparen = frame->stopparen; - recursed_depth = frame->prev_recursed_depth; - - RExC_frame_last = frame->prev_frame; - frame = frame->this_prev_frame; - goto fake_study_recurse; - } - - assert(!frame); - DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta); - - /* is this pattern infinite? Eg, consider /(a|b+)/ */ - if (is_inf_internal) - delta = OPTIMIZE_INFTY; - - /* deal with (*ACCEPT), Eg, consider /(foo(*ACCEPT)|bop)bar/ */ - if (min > stopmin) { - /* - At this point 'min' represents the minimum length string we can - match while *ignoring* the implication of ACCEPT, and 'delta' - represents the difference between the minimum length and maximum - length, and if the pattern matches an infinitely long string - (consider the + and * quantifiers) then we use the special delta - value of OPTIMIZE_INFTY to represent it. 'stopmin' is the - minimum length that can be matched *and* accepted. - - A pattern is accepted when matching was successful *and* - complete, and thus there is no further matching needing to be - done, no backtracking to occur, etc. Prior to the introduction - of ACCEPT the only opcode that signaled acceptance was the END - opcode, which is always the very last opcode in a regex program. - ACCEPT is thus conceptually an early successful return out of - the matching process. stopmin starts out as OPTIMIZE_INFTY to - represent "the entire pattern", and is ratched down to the - "current min" if necessary when an ACCEPT opcode is encountered. - - Thus stopmin might be smaller than min if we saw an (*ACCEPT), - and we now need to account for it in both min and delta. - Consider that in a pattern /AB/ normally the min length it can - match can be computed as min(A)+min(B). But (*ACCEPT) means - that it might be something else, not even neccesarily min(A) at - all. Consider - - A = /(foo(*ACCEPT)|x+)/ - B = /whop/ - AB = /(foo(*ACCEPT)|x+)whop/ - - The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY - for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for - "whop", and the delta of 0 as the pattern is of fixed length, the - stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT. - When handling AB we expect to see a min of 5 for "xwhop", and a - delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3 - for "foo". This should result in a final min of 3 for "foo", and - a final delta of OPTIMIZE_INFTY for "xxxxx...whop". - - In something like /(dude(*ACCEPT)|irk)x{3,7}/ we would have a - min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the - stop min would be 4 for "dude". This should result in a final - min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx". - - When min is smaller than stopmin then we can ignore it. In the - fragment /(x{10,20}(*ACCEPT)|a)b+/, we would have a min of 2, - and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously - the ACCEPT doesn't reduce the minimum length of the string that - might be matched, nor affect the maximum length. - - In something like /foo(*ACCEPT)ba?r/ we would have a min of 5 - for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for - "foo". We currently turn this into a min of 3 for "foo" and a - delta of 3 for "foobar" even though technically "foobar" isn't - possible. ACCEPT affects some aspects of the optimizer, like - length computations and mandatory substring optimizations, but - there are other optimzations this routine perfoms that are not - affected and this compromise simplifies implementation. - - It might be helpful to consider that this C function is called - recursively on the pattern in a bottom up fashion, and that the - min returned by a nested call may be marked as coming from an - ACCEPT, causing its callers to treat the returned min as a - stopmin as the recursion unwinds. Thus a single ACCEPT can affect - multiple calls into this function in different ways. - */ - - if (OPTIMIZE_INFTY - delta >= min - stopmin) - delta += min - stopmin; - else - delta = OPTIMIZE_INFTY; - min = stopmin; - } - - *scanp = scan; - *deltap = delta; - - if (flags & SCF_DO_SUBSTR && is_inf) - data->pos_delta = OPTIMIZE_INFTY - data->pos_min; - if (is_par > (I32)U8_MAX) - is_par = 0; - if (is_par && pars==1 && data) { - data->flags |= SF_IN_PAR; - data->flags &= ~SF_HAS_PAR; - } - else if (pars && data) { - data->flags |= SF_HAS_PAR; - data->flags &= ~SF_IN_PAR; - } - if (flags & SCF_DO_STCLASS_OR) - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - if (flags & SCF_TRIE_RESTUDY) - data->flags |= SCF_TRIE_RESTUDY; - - - if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) { - if (min > OPTIMIZE_INFTY - delta) - RExC_maxlen = OPTIMIZE_INFTY; - else if (RExC_maxlen < min + delta) - RExC_maxlen = min + delta; - } - DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta); - return min; -} - /* add a data member to the struct reg_data attached to this regex, it should * always return a non-zero return. the 's' argument is the type of the items * being added and the n is the number of items. The length of 's' should match * the number of items. */ -STATIC U32 -S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) +U32 +Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) { U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1; - PERL_ARGS_ASSERT_ADD_DATA; + PERL_ARGS_ASSERT_REG_ADD_DATA; /* in the below expression we have (count + n - 1), the minus one is there * because the struct that we allocate already contains a slot for 1 data @@ -6958,9 +320,9 @@ S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) /* when count == 1 it means we have not initialized anything. * we always fill the 0 slot of the data array with a '%' entry, which * means "zero" (all the other types are letters) which exists purely - * so the return from add_data is ALWAYS true, so we can tell it apart + * so the return from reg_add_data is ALWAYS true, so we can tell it apart * from a "no value" idx=0 in places where we would return an index - * into add_data. This is particularly important with the new "single + * into reg_add_data. This is particularly important with the new "single * pass, usually, but not always" strategy that we use, where the code * will use a 0 to represent "not able to compute this yet". */ @@ -7435,9 +797,9 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, assert(n < pRExC_state->code_blocks->count); src = &ri->code_blocks->cb[i]; dst = &pRExC_state->code_blocks->cb[n]; - dst->start = src->start + offset; - dst->end = src->end + offset; - dst->block = src->block; + dst->start = src->start + offset; + dst->end = src->end + offset; + dst->block = src->block; dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) src->src_regex ? src->src_regex @@ -7689,9 +1051,9 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, assert(pat[src->start] == '('); assert(pat[src->end] == ')'); - dst->start = src->start; - dst->end = src->end; - dst->block = src->block; + dst->start = src->start; + dst->end = src->end; + dst->block = src->block; dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) : src->src_regex; dst++; @@ -8236,7 +1598,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_rx->intflags = 0; - RExC_flags = rx_flags; /* don't let top level (?i) bleed */ + RExC_flags = rx_flags; /* don't let top level (?i) bleed */ RExC_parse_set(exp); /* This NUL is guaranteed because the pattern comes from an SV*, and the sv @@ -8423,11 +1785,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ if (UTF) - SvUTF8_on(Rx); /* Unicode in it? */ + SvUTF8_on(Rx); /* Unicode in it? */ RExC_rxi->regstclass = NULL; - if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */ + if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */ RExC_rx->intflags |= PREGf_NAUGHTY; - scan = RExC_rxi->program + 1; /* First BRANCH. */ + scan = RExC_rxi->program + 1; /* First BRANCH. */ /* testing for BRANCH here tells us whether there is "must appear" data in the pattern. If there is then we can use it for optimisations */ @@ -8487,7 +1849,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* Ignore EXACT as we deal with it later. */ if (REGNODE_TYPE(OP(first)) == EXACT) { if (! isEXACTFish(OP(first))) { - NOOP; /* Empty, get anchored substr later. */ + NOOP; /* Empty, get anchored substr later. */ } else RExC_rxi->regstclass = first; @@ -8581,7 +1943,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, ssc_init(pRExC_state, &ch_class); data.start_class = &ch_class; stclass_flag = SCF_DO_STCLASS_AND; - } else /* XXXX Check for BOUND? */ + } else /* XXXX Check for BOUND? */ stclass_flag = 0; data.last_closep = &last_close; data.last_close_opp = &last_close_op; @@ -8661,7 +2023,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) && is_ssc_worth_it(pRExC_state, data.start_class)) { - const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f")); ssc_finalize(pRExC_state, data.start_class); @@ -8670,7 +2032,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, (regnode_ssc*)RExC_rxi->data->data[n], regnode_ssc); RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n]; - RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ + RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state); Perl_re_printf( aTHX_ @@ -8745,7 +2107,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) && is_ssc_worth_it(pRExC_state, data.start_class)) { - const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f")); ssc_finalize(pRExC_state, data.start_class); @@ -8754,7 +2116,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, (regnode_ssc*)RExC_rxi->data->data[n], regnode_ssc); RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n]; - RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ + RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state); Perl_re_printf( aTHX_ @@ -8860,7 +2222,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, #ifdef DEBUGGING if (RExC_paren_names) { - RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); + RExC_rxi->name_list_idx = reg_add_data( pRExC_state, STR_WITH_LEN("a")); RExC_rxi->data->data[RExC_rxi->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); } else @@ -8916,426 +2278,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } -SV* -Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, - const U32 flags) -{ - PERL_ARGS_ASSERT_REG_NAMED_BUFF; - - PERL_UNUSED_ARG(value); - - if (flags & RXapif_FETCH) { - return reg_named_buff_fetch(rx, key, flags); - } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { - Perl_croak_no_modify(); - return NULL; - } else if (flags & RXapif_EXISTS) { - return reg_named_buff_exists(rx, key, flags) - ? &PL_sv_yes - : &PL_sv_no; - } else if (flags & RXapif_REGNAMES) { - return reg_named_buff_all(rx, flags); - } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { - return reg_named_buff_scalar(rx, flags); - } else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); - return NULL; - } -} - -SV* -Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, - const U32 flags) -{ - PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; - PERL_UNUSED_ARG(lastkey); - - if (flags & RXapif_FIRSTKEY) - return reg_named_buff_firstkey(rx, flags); - else if (flags & RXapif_NEXTKEY) - return reg_named_buff_nextkey(rx, flags); - else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", - (int)flags); - return NULL; - } -} - -SV* -Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, - const U32 flags) -{ - SV *ret; - struct regexp *const rx = ReANY(r); - - PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; - - if (rx && RXp_PAREN_NAMES(rx)) { - HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); - if (he_str) { - IV i; - SV* sv_dat=HeVAL(he_str); - I32 *nums=(I32*)SvPVX(sv_dat); - AV * const retarray = (flags & RXapif_ALL) ? newAV_alloc_x(SvIVX(sv_dat)) : NULL; - for ( i=0; i<SvIVX(sv_dat); i++ ) { - if ((I32)(rx->nparens) >= nums[i] - && rx->offs[nums[i]].start != -1 - && rx->offs[nums[i]].end != -1) - { - ret = newSVpvs(""); - CALLREG_NUMBUF_FETCH(r, nums[i], ret); - if (!retarray) - return ret; - } else { - if (retarray) - ret = newSV_type(SVt_NULL); - } - if (retarray) - av_push_simple(retarray, ret); - } - if (retarray) - return newRV_noinc(MUTABLE_SV(retarray)); - } - } - return NULL; -} - -bool -Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, - const U32 flags) -{ - struct regexp *const rx = ReANY(r); - - PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; - - if (rx && RXp_PAREN_NAMES(rx)) { - if (flags & RXapif_ALL) { - return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); - } else { - SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); - if (sv) { - SvREFCNT_dec_NN(sv); - return TRUE; - } else { - return FALSE; - } - } - } else { - return FALSE; - } -} - -SV* -Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) -{ - struct regexp *const rx = ReANY(r); - - PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; - - if ( rx && RXp_PAREN_NAMES(rx) ) { - (void)hv_iterinit(RXp_PAREN_NAMES(rx)); - - return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); - } else { - return FALSE; - } -} - -SV* -Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) -{ - struct regexp *const rx = ReANY(r); - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; - - if (rx && RXp_PAREN_NAMES(rx)) { - HV *hv = RXp_PAREN_NAMES(rx); - HE *temphe; - while ( (temphe = hv_iternext_flags(hv, 0)) ) { - IV i; - IV parno = 0; - SV* sv_dat = HeVAL(temphe); - I32 *nums = (I32*)SvPVX(sv_dat); - for ( i = 0; i < SvIVX(sv_dat); i++ ) { - if ((I32)(rx->lastparen) >= nums[i] && - rx->offs[nums[i]].start != -1 && - rx->offs[nums[i]].end != -1) - { - parno = nums[i]; - break; - } - } - if (parno || flags & RXapif_ALL) { - return newSVhek(HeKEY_hek(temphe)); - } - } - } - return NULL; -} - -SV* -Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) -{ - SV *ret; - AV *av; - SSize_t length; - struct regexp *const rx = ReANY(r); - - PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; - - if (rx && RXp_PAREN_NAMES(rx)) { - if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { - return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); - } else if (flags & RXapif_ONE) { - ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); - av = MUTABLE_AV(SvRV(ret)); - length = av_count(av); - SvREFCNT_dec_NN(ret); - return newSViv(length); - } else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", - (int)flags); - return NULL; - } - } - return &PL_sv_undef; -} - -SV* -Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) -{ - struct regexp *const rx = ReANY(r); - AV *av = newAV(); - - PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; - - if (rx && RXp_PAREN_NAMES(rx)) { - HV *hv= RXp_PAREN_NAMES(rx); - HE *temphe; - (void)hv_iterinit(hv); - while ( (temphe = hv_iternext_flags(hv, 0)) ) { - IV i; - IV parno = 0; - SV* sv_dat = HeVAL(temphe); - I32 *nums = (I32*)SvPVX(sv_dat); - for ( i = 0; i < SvIVX(sv_dat); i++ ) { - if ((I32)(rx->lastparen) >= nums[i] && - rx->offs[nums[i]].start != -1 && - rx->offs[nums[i]].end != -1) - { - parno = nums[i]; - break; - } - } - if (parno || flags & RXapif_ALL) { - av_push_simple(av, newSVhek(HeKEY_hek(temphe))); - } - } - } - - return newRV_noinc(MUTABLE_SV(av)); -} - -void -Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, - SV * const sv) -{ - struct regexp *const rx = ReANY(r); - char *s = NULL; - SSize_t i = 0; - SSize_t s1, t1; - I32 n = paren; - - PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; - - if ( n == RX_BUFF_IDX_CARET_PREMATCH - || n == RX_BUFF_IDX_CARET_FULLMATCH - || n == RX_BUFF_IDX_CARET_POSTMATCH - ) - { - bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); - if (!keepcopy) { - /* on something like - * $r = qr/.../; - * /$qr/p; - * the KEEPCOPY is set on the PMOP rather than the regex */ - if (PL_curpm && r == PM_GETRE(PL_curpm)) - keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); - } - if (!keepcopy) - goto ret_undef; - } - - if (!rx->subbeg) - goto ret_undef; - - if (n == RX_BUFF_IDX_CARET_FULLMATCH) - /* no need to distinguish between them any more */ - n = RX_BUFF_IDX_FULLMATCH; - - if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) - && rx->offs[0].start != -1) - { - /* $`, ${^PREMATCH} */ - i = rx->offs[0].start; - s = rx->subbeg; - } - else - if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) - && rx->offs[0].end != -1) - { - /* $', ${^POSTMATCH} */ - s = rx->subbeg - rx->suboffset + rx->offs[0].end; - i = rx->sublen + rx->suboffset - rx->offs[0].end; - } - else - if (inRANGE(n, 0, (I32)rx->nparens) && - (s1 = rx->offs[n].start) != -1 && - (t1 = rx->offs[n].end) != -1) - { - /* $&, ${^MATCH}, $1 ... */ - i = t1 - s1; - s = rx->subbeg + s1 - rx->suboffset; - } else { - goto ret_undef; - } - - assert(s >= rx->subbeg); - assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); - if (i >= 0) { -#ifdef NO_TAINT_SUPPORT - sv_setpvn(sv, s, i); -#else - const int oldtainted = TAINT_get; - TAINT_NOT; - sv_setpvn(sv, s, i); - TAINT_set(oldtainted); -#endif - if (RXp_MATCH_UTF8(rx)) - SvUTF8_on(sv); - else - SvUTF8_off(sv); - if (TAINTING_get) { - if (RXp_MATCH_TAINTED(rx)) { - if (SvTYPE(sv) >= SVt_PVMG) { - MAGIC* const mg = SvMAGIC(sv); - MAGIC* mgt; - TAINT; - SvMAGIC_set(sv, mg->mg_moremagic); - SvTAINT(sv); - if ((mgt = SvMAGIC(sv))) { - mg->mg_moremagic = mgt; - SvMAGIC_set(sv, mg); - } - } else { - TAINT; - SvTAINT(sv); - } - } else - SvTAINTED_off(sv); - } - } else { - ret_undef: - sv_set_undef(sv); - return; - } -} - -void -Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, - SV const * const value) -{ - PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; - - PERL_UNUSED_ARG(rx); - PERL_UNUSED_ARG(paren); - PERL_UNUSED_ARG(value); - - if (!PL_localizing) - Perl_croak_no_modify(); -} - -I32 -Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, - const I32 paren) -{ - struct regexp *const rx = ReANY(r); - I32 i; - I32 s1, t1; - - PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; - - if ( paren == RX_BUFF_IDX_CARET_PREMATCH - || paren == RX_BUFF_IDX_CARET_FULLMATCH - || paren == RX_BUFF_IDX_CARET_POSTMATCH - ) - { - bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); - if (!keepcopy) { - /* on something like - * $r = qr/.../; - * /$qr/p; - * the KEEPCOPY is set on the PMOP rather than the regex */ - if (PL_curpm && r == PM_GETRE(PL_curpm)) - keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); - } - if (!keepcopy) - goto warn_undef; - } - - /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */ - switch (paren) { - case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ - case RX_BUFF_IDX_PREMATCH: /* $` */ - if (rx->offs[0].start != -1) { - i = rx->offs[0].start; - if (i > 0) { - s1 = 0; - t1 = i; - goto getlen; - } - } - return 0; - - case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ - case RX_BUFF_IDX_POSTMATCH: /* $' */ - if (rx->offs[0].end != -1) { - i = rx->sublen - rx->offs[0].end; - if (i > 0) { - s1 = rx->offs[0].end; - t1 = rx->sublen; - goto getlen; - } - } - return 0; - - default: /* $& / ${^MATCH}, $1, $2, ... */ - if (paren <= (I32)rx->nparens && - (s1 = rx->offs[paren].start) != -1 && - (t1 = rx->offs[paren].end) != -1) - { - i = t1 - s1; - goto getlen; - } else { - warn_undef: - if (ckWARN(WARN_UNINITIALIZED)) - report_uninit((const SV *)sv); - return 0; - } - } - getlen: - if (i > 0 && RXp_MATCH_UTF8(rx)) { - const char * const s = rx->subbeg - rx->suboffset + s1; - const U8 *ep; - STRLEN el; - - i = t1 - s1; - if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) - i = el; - } - return i; -} SV* Perl_reg_qr_package(pTHX_ REGEXP * const rx) @@ -9458,1730 +2400,6 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) Perl_re_printf( aTHX_ fmt "\n",args); \ }) -/* This section of code defines the inversion list object and its methods. The - * interfaces are highly subject to change, so as much as possible is static to - * this file. An inversion list is here implemented as a malloc'd C UV array - * as an SVt_INVLIST scalar. - * - * An inversion list for Unicode is an array of code points, sorted by ordinal - * number. Each element gives the code point that begins a range that extends - * up-to but not including the code point given by the next element. The final - * element gives the first code point of a range that extends to the platform's - * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4], - * ...) give ranges whose code points are all in the inversion list. We say - * that those ranges are in the set. The odd-numbered elements give ranges - * whose code points are not in the inversion list, and hence not in the set. - * Thus, element [0] is the first code point in the list. Element [1] - * is the first code point beyond that not in the list; and element [2] is the - * first code point beyond that that is in the list. In other words, the first - * range is invlist[0]..(invlist[1]-1), and all code points in that range are - * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and - * all code points in that range are not in the inversion list. The third - * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion - * list, and so forth. Thus every element whose index is divisible by two - * gives the beginning of a range that is in the list, and every element whose - * index is not divisible by two gives the beginning of a range not in the - * list. If the final element's index is divisible by two, the inversion list - * extends to the platform's infinity; otherwise the highest code point in the - * inversion list is the contents of that element minus 1. - * - * A range that contains just a single code point N will look like - * invlist[i] == N - * invlist[i+1] == N+1 - * - * If N is UV_MAX (the highest representable code point on the machine), N+1 is - * impossible to represent, so element [i+1] is omitted. The single element - * inversion list - * invlist[0] == UV_MAX - * contains just UV_MAX, but is interpreted as matching to infinity. - * - * Taking the complement (inverting) an inversion list is quite simple, if the - * first element is 0, remove it; otherwise add a 0 element at the beginning. - * This implementation reserves an element at the beginning of each inversion - * list to always contain 0; there is an additional flag in the header which - * indicates if the list begins at the 0, or is offset to begin at the next - * element. This means that the inversion list can be inverted without any - * copying; just flip the flag. - * - * More about inversion lists can be found in "Unicode Demystified" - * Chapter 13 by Richard Gillam, published by Addison-Wesley. - * - * The inversion list data structure is currently implemented as an SV pointing - * to an array of UVs that the SV thinks are bytes. This allows us to have an - * array of UV whose memory management is automatically handled by the existing - * facilities for SV's. - * - * Some of the methods should always be private to the implementation, and some - * should eventually be made public */ - -/* The header definitions are in F<invlist_inline.h> */ - -#ifndef PERL_IN_XSUB_RE - -PERL_STATIC_INLINE UV* -S__invlist_array_init(SV* const invlist, const bool will_have_0) -{ - /* Returns a pointer to the first element in the inversion list's array. - * This is called upon initialization of an inversion list. Where the - * array begins depends on whether the list has the code point U+0000 in it - * or not. The other parameter tells it whether the code that follows this - * call is about to put a 0 in the inversion list or not. The first - * element is either the element reserved for 0, if TRUE, or the element - * after it, if FALSE */ - - bool* offset = get_invlist_offset_addr(invlist); - UV* zero_addr = (UV *) SvPVX(invlist); - - PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; - - /* Must be empty */ - assert(! _invlist_len(invlist)); - - *zero_addr = 0; - - /* 1^1 = 0; 1^0 = 1 */ - *offset = 1 ^ will_have_0; - return zero_addr + *offset; -} - -STATIC void -S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src) -{ - /* Replaces the inversion list in 'dest' with the one from 'src'. It - * steals the list from 'src', so 'src' is made to have a NULL list. This - * is similar to what SvSetMagicSV() would do, if it were implemented on - * inversion lists, though this routine avoids a copy */ - - const UV src_len = _invlist_len(src); - const bool src_offset = *get_invlist_offset_addr(src); - const STRLEN src_byte_len = SvLEN(src); - char * array = SvPVX(src); - -#ifndef NO_TAINT_SUPPORT - const int oldtainted = TAINT_get; -#endif - - PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC; - - assert(is_invlist(src)); - assert(is_invlist(dest)); - assert(! invlist_is_iterating(src)); - assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src)); - - /* Make sure it ends in the right place with a NUL, as our inversion list - * manipulations aren't careful to keep this true, but sv_usepvn_flags() - * asserts it */ - array[src_byte_len - 1] = '\0'; - - TAINT_NOT; /* Otherwise it breaks */ - sv_usepvn_flags(dest, - (char *) array, - src_byte_len - 1, - - /* This flag is documented to cause a copy to be avoided */ - SV_HAS_TRAILING_NUL); - TAINT_set(oldtainted); - SvPV_set(src, 0); - SvLEN_set(src, 0); - SvCUR_set(src, 0); - - /* Finish up copying over the other fields in an inversion list */ - *get_invlist_offset_addr(dest) = src_offset; - invlist_set_len(dest, src_len, src_offset); - *get_invlist_previous_index_addr(dest) = 0; - invlist_iterfinish(dest); -} - -PERL_STATIC_INLINE IV* -S_get_invlist_previous_index_addr(SV* invlist) -{ - /* Return the address of the IV that is reserved to hold the cached index - * */ - PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; - - assert(is_invlist(invlist)); - - return &(((XINVLIST*) SvANY(invlist))->prev_index); -} - -PERL_STATIC_INLINE IV -S_invlist_previous_index(SV* const invlist) -{ - /* Returns cached index of previous search */ - - PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX; - - return *get_invlist_previous_index_addr(invlist); -} - -PERL_STATIC_INLINE void -S_invlist_set_previous_index(SV* const invlist, const IV index) -{ - /* Caches <index> for later retrieval */ - - PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX; - - assert(index == 0 || index < (int) _invlist_len(invlist)); - - *get_invlist_previous_index_addr(invlist) = index; -} - -PERL_STATIC_INLINE void -S_invlist_trim(SV* invlist) -{ - /* Free the not currently-being-used space in an inversion list */ - - /* But don't free up the space needed for the 0 UV that is always at the - * beginning of the list, nor the trailing NUL */ - const UV min_size = TO_INTERNAL_SIZE(1) + 1; - - PERL_ARGS_ASSERT_INVLIST_TRIM; - - assert(is_invlist(invlist)); - - SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1)); -} - -PERL_STATIC_INLINE void -S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */ -{ - PERL_ARGS_ASSERT_INVLIST_CLEAR; - - assert(is_invlist(invlist)); - - invlist_set_len(invlist, 0, 0); - invlist_trim(invlist); -} - -#endif /* ifndef PERL_IN_XSUB_RE */ - -PERL_STATIC_INLINE bool -S_invlist_is_iterating(const SV* const invlist) -{ - PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; - - /* get_invlist_iter_addr()'s sv is non-const only because it returns a - * value that can be used to modify the invlist, it doesn't modify the - * invlist itself */ - return *(get_invlist_iter_addr((SV*)invlist)) < (STRLEN) UV_MAX; -} - -#ifndef PERL_IN_XSUB_RE - -PERL_STATIC_INLINE UV -S_invlist_max(const SV* const invlist) -{ - /* Returns the maximum number of elements storable in the inversion list's - * array, without having to realloc() */ - - PERL_ARGS_ASSERT_INVLIST_MAX; - - assert(is_invlist(invlist)); - - /* Assumes worst case, in which the 0 element is not counted in the - * inversion list, so subtracts 1 for that */ - return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ - ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 - : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; -} - -STATIC void -S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size) -{ - PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS; - - /* First 1 is in case the zero element isn't in the list; second 1 is for - * trailing NUL */ - SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1); - invlist_set_len(invlist, 0, 0); - - /* Force iterinit() to be used to get iteration to work */ - invlist_iterfinish(invlist); - - *get_invlist_previous_index_addr(invlist) = 0; - SvPOK_on(invlist); /* This allows B to extract the PV */ -} - -SV* -Perl__new_invlist(pTHX_ IV initial_size) -{ - - /* Return a pointer to a newly constructed inversion list, with enough - * space to store 'initial_size' elements. If that number is negative, a - * system default is used instead */ - - SV* new_list; - - if (initial_size < 0) { - initial_size = 10; - } - - new_list = newSV_type(SVt_INVLIST); - initialize_invlist_guts(new_list, initial_size); - - return new_list; -} - -SV* -Perl__new_invlist_C_array(pTHX_ const UV* const list) -{ - /* Return a pointer to a newly constructed inversion list, initialized to - * point to <list>, which has to be in the exact correct inversion list - * form, including internal fields. Thus this is a dangerous routine that - * should not be used in the wrong hands. The passed in 'list' contains - * several header fields at the beginning that are not part of the - * inversion list body proper */ - - const STRLEN length = (STRLEN) list[0]; - const UV version_id = list[1]; - const bool offset = cBOOL(list[2]); -#define HEADER_LENGTH 3 - /* If any of the above changes in any way, you must change HEADER_LENGTH - * (if appropriate) and regenerate INVLIST_VERSION_ID by running - * perl -E 'say int(rand 2**31-1)' - */ -#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and - data structure type, so that one being - passed in can be validated to be an - inversion list of the correct vintage. - */ - - SV* invlist = newSV_type(SVt_INVLIST); - - PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; - - if (version_id != INVLIST_VERSION_ID) { - Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); - } - - /* The generated array passed in includes header elements that aren't part - * of the list proper, so start it just after them */ - SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); - - SvLEN_set(invlist, 0); /* Means we own the contents, and the system - shouldn't touch it */ - - *(get_invlist_offset_addr(invlist)) = offset; - - /* The 'length' passed to us is the physical number of elements in the - * inversion list. But if there is an offset the logical number is one - * less than that */ - invlist_set_len(invlist, length - offset, offset); - - invlist_set_previous_index(invlist, 0); - - /* Initialize the iteration pointer. */ - invlist_iterfinish(invlist); - - SvREADONLY_on(invlist); - SvPOK_on(invlist); - - return invlist; -} - -STATIC void -S__append_range_to_invlist(pTHX_ SV* const invlist, - const UV start, const UV end) -{ - /* Subject to change or removal. Append the range from 'start' to 'end' at - * the end of the inversion list. The range must be above any existing - * ones. */ - - UV* array; - UV max = invlist_max(invlist); - UV len = _invlist_len(invlist); - bool offset; - - PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; - - if (len == 0) { /* Empty lists must be initialized */ - offset = start != 0; - array = _invlist_array_init(invlist, ! offset); - } - else { - /* Here, the existing list is non-empty. The current max entry in the - * list is generally the first value not in the set, except when the - * set extends to the end of permissible values, in which case it is - * the first entry in that final set, and so this call is an attempt to - * append out-of-order */ - - UV final_element = len - 1; - array = invlist_array(invlist); - if ( array[final_element] > start - || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) - { - Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c", - array[final_element], start, - ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); - } - - /* Here, it is a legal append. If the new range begins 1 above the end - * of the range below it, it is extending the range below it, so the - * new first value not in the set is one greater than the newly - * extended range. */ - offset = *get_invlist_offset_addr(invlist); - if (array[final_element] == start) { - if (end != UV_MAX) { - array[final_element] = end + 1; - } - else { - /* But if the end is the maximum representable on the machine, - * assume that infinity was actually what was meant. Just let - * the range that this would extend to have no end */ - invlist_set_len(invlist, len - 1, offset); - } - return; - } - } - - /* Here the new range doesn't extend any existing set. Add it */ - - len += 2; /* Includes an element each for the start and end of range */ - - /* If wll overflow the existing space, extend, which may cause the array to - * be moved */ - if (max < len) { - invlist_extend(invlist, len); - - /* Have to set len here to avoid assert failure in invlist_array() */ - invlist_set_len(invlist, len, offset); - - array = invlist_array(invlist); - } - else { - invlist_set_len(invlist, len, offset); - } - - /* The next item on the list starts the range, the one after that is - * one past the new range. */ - array[len - 2] = start; - if (end != UV_MAX) { - array[len - 1] = end + 1; - } - else { - /* But if the end is the maximum representable on the machine, just let - * the range have no end */ - invlist_set_len(invlist, len - 1, offset); - } -} - -SSize_t -Perl__invlist_search(SV* const invlist, const UV cp) -{ - /* Searches the inversion list for the entry that contains the input code - * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the - * return value is the index into the list's array of the range that - * contains <cp>, that is, 'i' such that - * array[i] <= cp < array[i+1] - */ - - IV low = 0; - IV mid; - IV high = _invlist_len(invlist); - const IV highest_element = high - 1; - const UV* array; - - PERL_ARGS_ASSERT__INVLIST_SEARCH; - - /* If list is empty, return failure. */ - if (UNLIKELY(high == 0)) { - return -1; - } - - /* (We can't get the array unless we know the list is non-empty) */ - array = invlist_array(invlist); - - mid = invlist_previous_index(invlist); - assert(mid >=0); - if (UNLIKELY(mid > highest_element)) { - mid = highest_element; - } - - /* <mid> contains the cache of the result of the previous call to this - * function (0 the first time). See if this call is for the same result, - * or if it is for mid-1. This is under the theory that calls to this - * function will often be for related code points that are near each other. - * And benchmarks show that caching gives better results. We also test - * here if the code point is within the bounds of the list. These tests - * replace others that would have had to be made anyway to make sure that - * the array bounds were not exceeded, and these give us extra information - * at the same time */ - if (cp >= array[mid]) { - if (cp >= array[highest_element]) { - return highest_element; - } - - /* Here, array[mid] <= cp < array[highest_element]. This means that - * the final element is not the answer, so can exclude it; it also - * means that <mid> is not the final element, so can refer to 'mid + 1' - * safely */ - if (cp < array[mid + 1]) { - return mid; - } - high--; - low = mid + 1; - } - else { /* cp < aray[mid] */ - if (cp < array[0]) { /* Fail if outside the array */ - return -1; - } - high = mid; - if (cp >= array[mid - 1]) { - goto found_entry; - } - } - - /* Binary search. What we are looking for is <i> such that - * array[i] <= cp < array[i+1] - * The loop below converges on the i+1. Note that there may not be an - * (i+1)th element in the array, and things work nonetheless */ - while (low < high) { - mid = (low + high) / 2; - assert(mid <= highest_element); - if (array[mid] <= cp) { /* cp >= array[mid] */ - low = mid + 1; - - /* We could do this extra test to exit the loop early. - if (cp < array[low]) { - return mid; - } - */ - } - else { /* cp < array[mid] */ - high = mid; - } - } - - found_entry: - high--; - invlist_set_previous_index(invlist, high); - return high; -} - -void -Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, - const bool complement_b, SV** output) -{ - /* Take the union of two inversion lists and point '*output' to it. On - * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly - * even 'a' or 'b'). If to an inversion list, the contents of the original - * list will be replaced by the union. The first list, 'a', may be - * NULL, in which case a copy of the second list is placed in '*output'. - * If 'complement_b' is TRUE, the union is taken of the complement - * (inversion) of 'b' instead of b itself. - * - * The basis for this comes from "Unicode Demystified" Chapter 13 by - * Richard Gillam, published by Addison-Wesley, and explained at some - * length there. The preface says to incorporate its examples into your - * code at your own risk. - * - * The algorithm is like a merge sort. */ - - const UV* array_a; /* a's array */ - const UV* array_b; - UV len_a; /* length of a's array */ - UV len_b; - - SV* u; /* the resulting union */ - UV* array_u; - UV len_u = 0; - - UV i_a = 0; /* current index into a's array */ - UV i_b = 0; - UV i_u = 0; - - /* running count, as explained in the algorithm source book; items are - * stopped accumulating and are output when the count changes to/from 0. - * The count is incremented when we start a range that's in an input's set, - * and decremented when we start a range that's not in a set. So this - * variable can be 0, 1, or 2. When it is 0 neither input is in their set, - * and hence nothing goes into the union; 1, just one of the inputs is in - * its set (and its current range gets added to the union); and 2 when both - * inputs are in their sets. */ - UV count = 0; - - PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; - assert(a != b); - assert(*output == NULL || is_invlist(*output)); - - len_b = _invlist_len(b); - if (len_b == 0) { - - /* Here, 'b' is empty, hence it's complement is all possible code - * points. So if the union includes the complement of 'b', it includes - * everything, and we need not even look at 'a'. It's easiest to - * create a new inversion list that matches everything. */ - if (complement_b) { - SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX); - - if (*output == NULL) { /* If the output didn't exist, just point it - at the new list */ - *output = everything; - } - else { /* Otherwise, replace its contents with the new list */ - invlist_replace_list_destroys_src(*output, everything); - SvREFCNT_dec_NN(everything); - } - - return; - } - - /* Here, we don't want the complement of 'b', and since 'b' is empty, - * the union will come entirely from 'a'. If 'a' is NULL or empty, the - * output will be empty */ - - if (a == NULL || _invlist_len(a) == 0) { - if (*output == NULL) { - *output = _new_invlist(0); - } - else { - invlist_clear(*output); - } - return; - } - - /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the - * union. We can just return a copy of 'a' if '*output' doesn't point - * to an existing list */ - if (*output == NULL) { - *output = invlist_clone(a, NULL); - return; - } - - /* If the output is to overwrite 'a', we have a no-op, as it's - * already in 'a' */ - if (*output == a) { - return; - } - - /* Here, '*output' is to be overwritten by 'a' */ - u = invlist_clone(a, NULL); - invlist_replace_list_destroys_src(*output, u); - SvREFCNT_dec_NN(u); - - return; - } - - /* Here 'b' is not empty. See about 'a' */ - - if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { - - /* Here, 'a' is empty (and b is not). That means the union will come - * entirely from 'b'. If '*output' is NULL, we can directly return a - * clone of 'b'. Otherwise, we replace the contents of '*output' with - * the clone */ - - SV ** dest = (*output == NULL) ? output : &u; - *dest = invlist_clone(b, NULL); - if (complement_b) { - _invlist_invert(*dest); - } - - if (dest == &u) { - invlist_replace_list_destroys_src(*output, u); - SvREFCNT_dec_NN(u); - } - - return; - } - - /* Here both lists exist and are non-empty */ - array_a = invlist_array(a); - array_b = invlist_array(b); - - /* If are to take the union of 'a' with the complement of b, set it - * up so are looking at b's complement. */ - if (complement_b) { - - /* To complement, we invert: if the first element is 0, remove it. To - * do this, we just pretend the array starts one later */ - if (array_b[0] == 0) { - array_b++; - len_b--; - } - else { - - /* But if the first element is not zero, we pretend the list starts - * at the 0 that is always stored immediately before the array. */ - array_b--; - len_b++; - } - } - - /* Size the union for the worst case: that the sets are completely - * disjoint */ - u = _new_invlist(len_a + len_b); - - /* Will contain U+0000 if either component does */ - array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0) - || (len_b > 0 && array_b[0] == 0)); - - /* Go through each input list item by item, stopping when have exhausted - * one of them */ - while (i_a < len_a && i_b < len_b) { - UV cp; /* The element to potentially add to the union's array */ - bool cp_in_set; /* is it in the input list's set or not */ - - /* We need to take one or the other of the two inputs for the union. - * Since we are merging two sorted lists, we take the smaller of the - * next items. In case of a tie, we take first the one that is in its - * set. If we first took the one not in its set, it would decrement - * the count, possibly to 0 which would cause it to be output as ending - * the range, and the next time through we would take the same number, - * and output it again as beginning the next range. By doing it the - * opposite way, there is no possibility that the count will be - * momentarily decremented to 0, and thus the two adjoining ranges will - * be seamlessly merged. (In a tie and both are in the set or both not - * in the set, it doesn't matter which we take first.) */ - if ( array_a[i_a] < array_b[i_b] - || ( array_a[i_a] == array_b[i_b] - && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) - { - cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); - cp = array_a[i_a++]; - } - else { - cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); - cp = array_b[i_b++]; - } - - /* Here, have chosen which of the two inputs to look at. Only output - * if the running count changes to/from 0, which marks the - * beginning/end of a range that's in the set */ - if (cp_in_set) { - if (count == 0) { - array_u[i_u++] = cp; - } - count++; - } - else { - count--; - if (count == 0) { - array_u[i_u++] = cp; - } - } - } - - - /* The loop above increments the index into exactly one of the input lists - * each iteration, and ends when either index gets to its list end. That - * means the other index is lower than its end, and so something is - * remaining in that one. We decrement 'count', as explained below, if - * that list is in its set. (i_a and i_b each currently index the element - * beyond the one we care about.) */ - if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) - || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) - { - count--; - } - - /* Above we decremented 'count' if the list that had unexamined elements in - * it was in its set. This has made it so that 'count' being non-zero - * means there isn't anything left to output; and 'count' equal to 0 means - * that what is left to output is precisely that which is left in the - * non-exhausted input list. - * - * To see why, note first that the exhausted input obviously has nothing - * left to add to the union. If it was in its set at its end, that means - * the set extends from here to the platform's infinity, and hence so does - * the union and the non-exhausted set is irrelevant. The exhausted set - * also contributed 1 to 'count'. If 'count' was 2, it got decremented to - * 1, but if it was 1, the non-exhausted set wasn't in its set, and so - * 'count' remains at 1. This is consistent with the decremented 'count' - * != 0 meaning there's nothing left to add to the union. - * - * But if the exhausted input wasn't in its set, it contributed 0 to - * 'count', and the rest of the union will be whatever the other input is. - * If 'count' was 0, neither list was in its set, and 'count' remains 0; - * otherwise it gets decremented to 0. This is consistent with 'count' - * == 0 meaning the remainder of the union is whatever is left in the - * non-exhausted list. */ - if (count != 0) { - len_u = i_u; - } - else { - IV copy_count = len_a - i_a; - if (copy_count > 0) { /* The non-exhausted input is 'a' */ - Copy(array_a + i_a, array_u + i_u, copy_count, UV); - } - else { /* The non-exhausted input is b */ - copy_count = len_b - i_b; - Copy(array_b + i_b, array_u + i_u, copy_count, UV); - } - len_u = i_u + copy_count; - } - - /* Set the result to the final length, which can change the pointer to - * array_u, so re-find it. (Note that it is unlikely that this will - * change, as we are shrinking the space, not enlarging it) */ - if (len_u != _invlist_len(u)) { - invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); - invlist_trim(u); - array_u = invlist_array(u); - } - - if (*output == NULL) { /* Simply return the new inversion list */ - *output = u; - } - else { - /* Otherwise, overwrite the inversion list that was in '*output'. We - * could instead free '*output', and then set it to 'u', but experience - * has shown [perl #127392] that if the input is a mortal, we can get a - * huge build-up of these during regex compilation before they get - * freed. */ - invlist_replace_list_destroys_src(*output, u); - SvREFCNT_dec_NN(u); - } - - return; -} - -void -Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, - const bool complement_b, SV** i) -{ - /* Take the intersection of two inversion lists and point '*i' to it. On - * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly - * even 'a' or 'b'). If to an inversion list, the contents of the original - * list will be replaced by the intersection. The first list, 'a', may be - * NULL, in which case '*i' will be an empty list. If 'complement_b' is - * TRUE, the result will be the intersection of 'a' and the complement (or - * inversion) of 'b' instead of 'b' directly. - * - * The basis for this comes from "Unicode Demystified" Chapter 13 by - * Richard Gillam, published by Addison-Wesley, and explained at some - * length there. The preface says to incorporate its examples into your - * code at your own risk. In fact, it had bugs - * - * The algorithm is like a merge sort, and is essentially the same as the - * union above - */ - - const UV* array_a; /* a's array */ - const UV* array_b; - UV len_a; /* length of a's array */ - UV len_b; - - SV* r; /* the resulting intersection */ - UV* array_r; - UV len_r = 0; - - UV i_a = 0; /* current index into a's array */ - UV i_b = 0; - UV i_r = 0; - - /* running count of how many of the two inputs are postitioned at ranges - * that are in their sets. As explained in the algorithm source book, - * items are stopped accumulating and are output when the count changes - * to/from 2. The count is incremented when we start a range that's in an - * input's set, and decremented when we start a range that's not in a set. - * Only when it is 2 are we in the intersection. */ - UV count = 0; - - PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; - assert(a != b); - assert(*i == NULL || is_invlist(*i)); - - /* Special case if either one is empty */ - len_a = (a == NULL) ? 0 : _invlist_len(a); - if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { - if (len_a != 0 && complement_b) { - - /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b' - * must be empty. Here, also we are using 'b's complement, which - * hence must be every possible code point. Thus the intersection - * is simply 'a'. */ - - if (*i == a) { /* No-op */ - return; - } - - if (*i == NULL) { - *i = invlist_clone(a, NULL); - return; - } - - r = invlist_clone(a, NULL); - invlist_replace_list_destroys_src(*i, r); - SvREFCNT_dec_NN(r); - return; - } - - /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The - * intersection must be empty */ - if (*i == NULL) { - *i = _new_invlist(0); - return; - } - - invlist_clear(*i); - return; - } - - /* Here both lists exist and are non-empty */ - array_a = invlist_array(a); - array_b = invlist_array(b); - - /* If are to take the intersection of 'a' with the complement of b, set it - * up so are looking at b's complement. */ - if (complement_b) { - - /* To complement, we invert: if the first element is 0, remove it. To - * do this, we just pretend the array starts one later */ - if (array_b[0] == 0) { - array_b++; - len_b--; - } - else { - - /* But if the first element is not zero, we pretend the list starts - * at the 0 that is always stored immediately before the array. */ - array_b--; - len_b++; - } - } - - /* Size the intersection for the worst case: that the intersection ends up - * fragmenting everything to be completely disjoint */ - r= _new_invlist(len_a + len_b); - - /* Will contain U+0000 iff both components do */ - array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 - && len_b > 0 && array_b[0] == 0); - - /* Go through each list item by item, stopping when have exhausted one of - * them */ - while (i_a < len_a && i_b < len_b) { - UV cp; /* The element to potentially add to the intersection's - array */ - bool cp_in_set; /* Is it in the input list's set or not */ - - /* We need to take one or the other of the two inputs for the - * intersection. Since we are merging two sorted lists, we take the - * smaller of the next items. In case of a tie, we take first the one - * that is not in its set (a difference from the union algorithm). If - * we first took the one in its set, it would increment the count, - * possibly to 2 which would cause it to be output as starting a range - * in the intersection, and the next time through we would take that - * same number, and output it again as ending the set. By doing the - * opposite of this, there is no possibility that the count will be - * momentarily incremented to 2. (In a tie and both are in the set or - * both not in the set, it doesn't matter which we take first.) */ - if ( array_a[i_a] < array_b[i_b] - || ( array_a[i_a] == array_b[i_b] - && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) - { - cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); - cp = array_a[i_a++]; - } - else { - cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); - cp= array_b[i_b++]; - } - - /* Here, have chosen which of the two inputs to look at. Only output - * if the running count changes to/from 2, which marks the - * beginning/end of a range that's in the intersection */ - if (cp_in_set) { - count++; - if (count == 2) { - array_r[i_r++] = cp; - } - } - else { - if (count == 2) { - array_r[i_r++] = cp; - } - count--; - } - - } - - /* The loop above increments the index into exactly one of the input lists - * each iteration, and ends when either index gets to its list end. That - * means the other index is lower than its end, and so something is - * remaining in that one. We increment 'count', as explained below, if the - * exhausted list was in its set. (i_a and i_b each currently index the - * element beyond the one we care about.) */ - if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) - || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) - { - count++; - } - - /* Above we incremented 'count' if the exhausted list was in its set. This - * has made it so that 'count' being below 2 means there is nothing left to - * output; otheriwse what's left to add to the intersection is precisely - * that which is left in the non-exhausted input list. - * - * To see why, note first that the exhausted input obviously has nothing - * left to affect the intersection. If it was in its set at its end, that - * means the set extends from here to the platform's infinity, and hence - * anything in the non-exhausted's list will be in the intersection, and - * anything not in it won't be. Hence, the rest of the intersection is - * precisely what's in the non-exhausted list The exhausted set also - * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing - * it means 'count' is now at least 2. This is consistent with the - * incremented 'count' being >= 2 means to add the non-exhausted list to - * the intersection. - * - * But if the exhausted input wasn't in its set, it contributed 0 to - * 'count', and the intersection can't include anything further; the - * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get - * incremented. This is consistent with 'count' being < 2 meaning nothing - * further to add to the intersection. */ - if (count < 2) { /* Nothing left to put in the intersection. */ - len_r = i_r; - } - else { /* copy the non-exhausted list, unchanged. */ - IV copy_count = len_a - i_a; - if (copy_count > 0) { /* a is the one with stuff left */ - Copy(array_a + i_a, array_r + i_r, copy_count, UV); - } - else { /* b is the one with stuff left */ - copy_count = len_b - i_b; - Copy(array_b + i_b, array_r + i_r, copy_count, UV); - } - len_r = i_r + copy_count; - } - - /* Set the result to the final length, which can change the pointer to - * array_r, so re-find it. (Note that it is unlikely that this will - * change, as we are shrinking the space, not enlarging it) */ - if (len_r != _invlist_len(r)) { - invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); - invlist_trim(r); - array_r = invlist_array(r); - } - - if (*i == NULL) { /* Simply return the calculated intersection */ - *i = r; - } - else { /* Otherwise, replace the existing inversion list in '*i'. We could - instead free '*i', and then set it to 'r', but experience has - shown [perl #127392] that if the input is a mortal, we can get a - huge build-up of these during regex compilation before they get - freed. */ - if (len_r) { - invlist_replace_list_destroys_src(*i, r); - } - else { - invlist_clear(*i); - } - SvREFCNT_dec_NN(r); - } - - return; -} - -SV* -Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end) -{ - /* Add the range from 'start' to 'end' inclusive to the inversion list's - * set. A pointer to the inversion list is returned. This may actually be - * a new list, in which case the passed in one has been destroyed. The - * passed-in inversion list can be NULL, in which case a new one is created - * with just the one range in it. The new list is not necessarily - * NUL-terminated. Space is not freed if the inversion list shrinks as a - * result of this function. The gain would not be large, and in many - * cases, this is called multiple times on a single inversion list, so - * anything freed may almost immediately be needed again. - * - * This used to mostly call the 'union' routine, but that is much more - * heavyweight than really needed for a single range addition */ - - UV* array; /* The array implementing the inversion list */ - UV len; /* How many elements in 'array' */ - SSize_t i_s; /* index into the invlist array where 'start' - should go */ - SSize_t i_e = 0; /* And the index where 'end' should go */ - UV cur_highest; /* The highest code point in the inversion list - upon entry to this function */ - - /* This range becomes the whole inversion list if none already existed */ - if (invlist == NULL) { - invlist = _new_invlist(2); - _append_range_to_invlist(invlist, start, end); - return invlist; - } - - /* Likewise, if the inversion list is currently empty */ - len = _invlist_len(invlist); - if (len == 0) { - _append_range_to_invlist(invlist, start, end); - return invlist; - } - - /* Starting here, we have to know the internals of the list */ - array = invlist_array(invlist); - - /* If the new range ends higher than the current highest ... */ - cur_highest = invlist_highest(invlist); - if (end > cur_highest) { - - /* If the whole range is higher, we can just append it */ - if (start > cur_highest) { - _append_range_to_invlist(invlist, start, end); - return invlist; - } - - /* Otherwise, add the portion that is higher ... */ - _append_range_to_invlist(invlist, cur_highest + 1, end); - - /* ... and continue on below to handle the rest. As a result of the - * above append, we know that the index of the end of the range is the - * final even numbered one of the array. Recall that the final element - * always starts a range that extends to infinity. If that range is in - * the set (meaning the set goes from here to infinity), it will be an - * even index, but if it isn't in the set, it's odd, and the final - * range in the set is one less, which is even. */ - if (end == UV_MAX) { - i_e = len; - } - else { - i_e = len - 2; - } - } - - /* We have dealt with appending, now see about prepending. If the new - * range starts lower than the current lowest ... */ - if (start < array[0]) { - - /* Adding something which has 0 in it is somewhat tricky, and uncommon. - * Let the union code handle it, rather than having to know the - * trickiness in two code places. */ - if (UNLIKELY(start == 0)) { - SV* range_invlist; - - range_invlist = _new_invlist(2); - _append_range_to_invlist(range_invlist, start, end); - - _invlist_union(invlist, range_invlist, &invlist); - - SvREFCNT_dec_NN(range_invlist); - - return invlist; - } - - /* If the whole new range comes before the first entry, and doesn't - * extend it, we have to insert it as an additional range */ - if (end < array[0] - 1) { - i_s = i_e = -1; - goto splice_in_new_range; - } - - /* Here the new range adjoins the existing first range, extending it - * downwards. */ - array[0] = start; - - /* And continue on below to handle the rest. We know that the index of - * the beginning of the range is the first one of the array */ - i_s = 0; - } - else { /* Not prepending any part of the new range to the existing list. - * Find where in the list it should go. This finds i_s, such that: - * invlist[i_s] <= start < array[i_s+1] - */ - i_s = _invlist_search(invlist, start); - } - - /* At this point, any extending before the beginning of the inversion list - * and/or after the end has been done. This has made it so that, in the - * code below, each endpoint of the new range is either in a range that is - * in the set, or is in a gap between two ranges that are. This means we - * don't have to worry about exceeding the array bounds. - * - * Find where in the list the new range ends (but we can skip this if we - * have already determined what it is, or if it will be the same as i_s, - * which we already have computed) */ - if (i_e == 0) { - i_e = (start == end) - ? i_s - : _invlist_search(invlist, end); - } - - /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e] - * is a range that goes to infinity there is no element at invlist[i_e+1], - * so only the first relation holds. */ - - if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) { - - /* Here, the ranges on either side of the beginning of the new range - * are in the set, and this range starts in the gap between them. - * - * The new range extends the range above it downwards if the new range - * ends at or above that range's start */ - const bool extends_the_range_above = ( end == UV_MAX - || end + 1 >= array[i_s+1]); - - /* The new range extends the range below it upwards if it begins just - * after where that range ends */ - if (start == array[i_s]) { - - /* If the new range fills the entire gap between the other ranges, - * they will get merged together. Other ranges may also get - * merged, depending on how many of them the new range spans. In - * the general case, we do the merge later, just once, after we - * figure out how many to merge. But in the case where the new - * range exactly spans just this one gap (possibly extending into - * the one above), we do the merge here, and an early exit. This - * is done here to avoid having to special case later. */ - if (i_e - i_s <= 1) { - - /* If i_e - i_s == 1, it means that the new range terminates - * within the range above, and hence 'extends_the_range_above' - * must be true. (If the range above it extends to infinity, - * 'i_s+2' will be above the array's limit, but 'len-i_s-2' - * will be 0, so no harm done.) */ - if (extends_the_range_above) { - Move(array + i_s + 2, array + i_s, len - i_s - 2, UV); - invlist_set_len(invlist, - len - 2, - *(get_invlist_offset_addr(invlist))); - return invlist; - } - - /* Here, i_e must == i_s. We keep them in sync, as they apply - * to the same range, and below we are about to decrement i_s - * */ - i_e--; - } - - /* Here, the new range is adjacent to the one below. (It may also - * span beyond the range above, but that will get resolved later.) - * Extend the range below to include this one. */ - array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1; - i_s--; - start = array[i_s]; - } - else if (extends_the_range_above) { - - /* Here the new range only extends the range above it, but not the - * one below. It merges with the one above. Again, we keep i_e - * and i_s in sync if they point to the same range */ - if (i_e == i_s) { - i_e++; - } - i_s++; - array[i_s] = start; - } - } - - /* Here, we've dealt with the new range start extending any adjoining - * existing ranges. - * - * If the new range extends to infinity, it is now the final one, - * regardless of what was there before */ - if (UNLIKELY(end == UV_MAX)) { - invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist))); - return invlist; - } - - /* If i_e started as == i_s, it has also been dealt with, - * and been updated to the new i_s, which will fail the following if */ - if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) { - - /* Here, the ranges on either side of the end of the new range are in - * the set, and this range ends in the gap between them. - * - * If this range is adjacent to (hence extends) the range above it, it - * becomes part of that range; likewise if it extends the range below, - * it becomes part of that range */ - if (end + 1 == array[i_e+1]) { - i_e++; - array[i_e] = start; - } - else if (start <= array[i_e]) { - array[i_e] = end + 1; - i_e--; - } - } - - if (i_s == i_e) { - - /* If the range fits entirely in an existing range (as possibly already - * extended above), it doesn't add anything new */ - if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) { - return invlist; - } - - /* Here, no part of the range is in the list. Must add it. It will - * occupy 2 more slots */ - splice_in_new_range: - - invlist_extend(invlist, len + 2); - array = invlist_array(invlist); - /* Move the rest of the array down two slots. Don't include any - * trailing NUL */ - Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV); - - /* Do the actual splice */ - array[i_e+1] = start; - array[i_e+2] = end + 1; - invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist))); - return invlist; - } - - /* Here the new range crossed the boundaries of a pre-existing range. The - * code above has adjusted things so that both ends are in ranges that are - * in the set. This means everything in between must also be in the set. - * Just squash things together */ - Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV); - invlist_set_len(invlist, - len - i_e + i_s, - *(get_invlist_offset_addr(invlist))); - - return invlist; -} - -SV* -Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, - UV** other_elements_ptr) -{ - /* Create and return an inversion list whose contents are to be populated - * by the caller. The caller gives the number of elements (in 'size') and - * the very first element ('element0'). This function will set - * '*other_elements_ptr' to an array of UVs, where the remaining elements - * are to be placed. - * - * Obviously there is some trust involved that the caller will properly - * fill in the other elements of the array. - * - * (The first element needs to be passed in, as the underlying code does - * things differently depending on whether it is zero or non-zero) */ - - SV* invlist = _new_invlist(size); - bool offset; - - PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; - - invlist = add_cp_to_invlist(invlist, element0); - offset = *get_invlist_offset_addr(invlist); - - invlist_set_len(invlist, size, offset); - *other_elements_ptr = invlist_array(invlist) + 1; - return invlist; -} - -#endif - -#ifndef PERL_IN_XSUB_RE -void -Perl__invlist_invert(pTHX_ SV* const invlist) -{ - /* Complement the input inversion list. This adds a 0 if the list didn't - * have a zero; removes it otherwise. As described above, the data - * structure is set up so that this is very efficient */ - - PERL_ARGS_ASSERT__INVLIST_INVERT; - - assert(! invlist_is_iterating(invlist)); - - /* The inverse of matching nothing is matching everything */ - if (_invlist_len(invlist) == 0) { - _append_range_to_invlist(invlist, 0, UV_MAX); - return; - } - - *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); -} - -SV* -Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist) -{ - /* Return a new inversion list that is a copy of the input one, which is - * unchanged. The new list will not be mortal even if the old one was. */ - - const STRLEN nominal_length = _invlist_len(invlist); - const STRLEN physical_length = SvCUR(invlist); - const bool offset = *(get_invlist_offset_addr(invlist)); - - PERL_ARGS_ASSERT_INVLIST_CLONE; - - if (new_invlist == NULL) { - new_invlist = _new_invlist(nominal_length); - } - else { - sv_upgrade(new_invlist, SVt_INVLIST); - initialize_invlist_guts(new_invlist, nominal_length); - } - - *(get_invlist_offset_addr(new_invlist)) = offset; - invlist_set_len(new_invlist, nominal_length, offset); - Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); - - return new_invlist; -} - -#endif - -PERL_STATIC_INLINE UV -S_invlist_lowest(SV* const invlist) -{ - /* Returns the lowest code point that matches an inversion list. This API - * has an ambiguity, as it returns 0 under either the lowest is actually - * 0, or if the list is empty. If this distinction matters to you, check - * for emptiness before calling this function */ - - UV len = _invlist_len(invlist); - UV *array; - - PERL_ARGS_ASSERT_INVLIST_LOWEST; - - if (len == 0) { - return 0; - } - - array = invlist_array(invlist); - - return array[0]; -} - -STATIC SV * -S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style) -{ - /* Get the contents of an inversion list into a string SV so that they can - * be printed out. If 'traditional_style' is TRUE, it uses the format - * traditionally done for debug tracing; otherwise it uses a format - * suitable for just copying to the output, with blanks between ranges and - * a dash between range components */ - - UV start, end; - SV* output; - const char intra_range_delimiter = (traditional_style ? '\t' : '-'); - const char inter_range_delimiter = (traditional_style ? '\n' : ' '); - - if (traditional_style) { - output = newSVpvs("\n"); - } - else { - output = newSVpvs(""); - } - - PERL_ARGS_ASSERT_INVLIST_CONTENTS; - - assert(! invlist_is_iterating(invlist)); - - invlist_iterinit(invlist); - while (invlist_iternext(invlist, &start, &end)) { - if (end == UV_MAX) { - Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c", - start, intra_range_delimiter, - inter_range_delimiter); - } - else if (end != start) { - Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c", - start, - intra_range_delimiter, - end, inter_range_delimiter); - } - else { - Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c", - start, inter_range_delimiter); - } - } - - if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */ - SvCUR_set(output, SvCUR(output) - 1); - } - - return output; -} - -#ifndef PERL_IN_XSUB_RE -void -Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, - const char * const indent, SV* const invlist) -{ - /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the - * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by - * the string 'indent'. The output looks like this: - [0] 0x000A .. 0x000D - [2] 0x0085 - [4] 0x2028 .. 0x2029 - [6] 0x3104 .. INFTY - * This means that the first range of code points matched by the list are - * 0xA through 0xD; the second range contains only the single code point - * 0x85, etc. An inversion list is an array of UVs. Two array elements - * are used to define each range (except if the final range extends to - * infinity, only a single element is needed). The array index of the - * first element for the corresponding range is given in brackets. */ - - UV start, end; - STRLEN count = 0; - - PERL_ARGS_ASSERT__INVLIST_DUMP; - - if (invlist_is_iterating(invlist)) { - Perl_dump_indent(aTHX_ level, file, - "%sCan't dump inversion list because is in middle of iterating\n", - indent); - return; - } - - invlist_iterinit(invlist); - while (invlist_iternext(invlist, &start, &end)) { - if (end == UV_MAX) { - Perl_dump_indent(aTHX_ level, file, - "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n", - indent, (UV)count, start); - } - else if (end != start) { - Perl_dump_indent(aTHX_ level, file, - "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n", - indent, (UV)count, start, end); - } - else { - Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n", - indent, (UV)count, start); - } - count += 2; - } -} - -#endif - -#if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE) -bool -Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) -{ - /* Return a boolean as to if the two passed in inversion lists are - * identical. The final argument, if TRUE, says to take the complement of - * the second inversion list before doing the comparison */ - - const UV len_a = _invlist_len(a); - UV len_b = _invlist_len(b); - - const UV* array_a = NULL; - const UV* array_b = NULL; - - PERL_ARGS_ASSERT__INVLISTEQ; - - /* This code avoids accessing the arrays unless it knows the length is - * non-zero */ - - if (len_a == 0) { - if (len_b == 0) { - return ! complement_b; - } - } - else { - array_a = invlist_array(a); - } - - if (len_b != 0) { - array_b = invlist_array(b); - } - - /* If are to compare 'a' with the complement of b, set it - * up so are looking at b's complement. */ - if (complement_b) { - - /* The complement of nothing is everything, so <a> would have to have - * just one element, starting at zero (ending at infinity) */ - if (len_b == 0) { - return (len_a == 1 && array_a[0] == 0); - } - if (array_b[0] == 0) { - - /* Otherwise, to complement, we invert. Here, the first element is - * 0, just remove it. To do this, we just pretend the array starts - * one later */ - - array_b++; - len_b--; - } - else { - - /* But if the first element is not zero, we pretend the list starts - * at the 0 that is always stored immediately before the array. */ - array_b--; - len_b++; - } - } - - return len_a == len_b - && memEQ(array_a, array_b, len_a * sizeof(array_a[0])); - -} -#endif - -/* - * As best we can, determine the characters that can match the start of - * the given EXACTF-ish node. This is for use in creating ssc nodes, so there - * can be false positive matches - * - * Returns the invlist as a new SV*; it is the caller's responsibility to - * call SvREFCNT_dec() when done with it. - */ -STATIC SV* -S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) -{ - const U8 * s = (U8*)STRING(node); - SSize_t bytelen = STR_LEN(node); - UV uc; - /* Start out big enough for 2 separate code points */ - SV* invlist = _new_invlist(4); - - PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST; - - if (! UTF) { - uc = *s; - - /* We punt and assume can match anything if the node begins - * with a multi-character fold. Things are complicated. For - * example, /ffi/i could match any of: - * "\N{LATIN SMALL LIGATURE FFI}" - * "\N{LATIN SMALL LIGATURE FF}I" - * "F\N{LATIN SMALL LIGATURE FI}" - * plus several other things; and making sure we have all the - * possibilities is hard. */ - if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) { - invlist = _add_range_to_invlist(invlist, 0, UV_MAX); - } - else { - /* Any Latin1 range character can potentially match any - * other depending on the locale, and in Turkic locales, 'I' and - * 'i' can match U+130 and U+131 */ - if (OP(node) == EXACTFL) { - _invlist_union(invlist, PL_Latin1, &invlist); - if (isALPHA_FOLD_EQ(uc, 'I')) { - invlist = add_cp_to_invlist(invlist, - LATIN_SMALL_LETTER_DOTLESS_I); - invlist = add_cp_to_invlist(invlist, - LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); - } - } - else { - /* But otherwise, it matches at least itself. We can - * quickly tell if it has a distinct fold, and if so, - * it matches that as well */ - invlist = add_cp_to_invlist(invlist, uc); - if (IS_IN_SOME_FOLD_L1(uc)) - invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]); - } - - /* Some characters match above-Latin1 ones under /i. This - * is true of EXACTFL ones when the locale is UTF-8 */ - if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) - && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA, - EXACTFAA_NO_TRIE))) - { - add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist); - } - } - } - else { /* Pattern is UTF-8 */ - U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; - const U8* e = s + bytelen; - IV fc; - - fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); - - /* The only code points that aren't folded in a UTF EXACTFish - * node are the problematic ones in EXACTFL nodes */ - if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) { - /* We need to check for the possibility that this EXACTFL - * node begins with a multi-char fold. Therefore we fold - * the first few characters of it so that we can make that - * check */ - U8 *d = folded; - int i; - - fc = -1; - for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { - if (isASCII(*s)) { - *(d++) = (U8) toFOLD(*s); - if (fc < 0) { /* Save the first fold */ - fc = *(d-1); - } - s++; - } - else { - STRLEN len; - UV fold = toFOLD_utf8_safe(s, e, d, &len); - if (fc < 0) { /* Save the first fold */ - fc = fold; - } - d += len; - s += UTF8SKIP(s); - } - } - - /* And set up so the code below that looks in this folded - * buffer instead of the node's string */ - e = d; - s = folded; - } - - /* When we reach here 's' points to the fold of the first - * character(s) of the node; and 'e' points to far enough along - * the folded string to be just past any possible multi-char - * fold. - * - * Like the non-UTF case above, we punt if the node begins with a - * multi-char fold */ - - if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { - invlist = _add_range_to_invlist(invlist, 0, UV_MAX); - } - else { /* Single char fold */ - unsigned int k; - U32 first_fold; - const U32 * remaining_folds; - Size_t folds_count; - - /* It matches itself */ - invlist = add_cp_to_invlist(invlist, fc); - - /* ... plus all the things that fold to it, which are found in - * PL_utf8_foldclosures */ - folds_count = _inverse_folds(fc, &first_fold, - &remaining_folds); - for (k = 0; k < folds_count; k++) { - UV c = (k == 0) ? first_fold : remaining_folds[k-1]; - - /* /aa doesn't allow folds between ASCII and non- */ - if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE) - && isASCII(c) != isASCII(fc)) - { - continue; - } - - invlist = add_cp_to_invlist(invlist, c); - } - - if (OP(node) == EXACTFL) { - - /* If either [iI] are present in an EXACTFL node the above code - * should have added its normal case pair, but under a Turkish - * locale they could match instead the case pairs from it. Add - * those as potential matches as well */ - if (isALPHA_FOLD_EQ(fc, 'I')) { - invlist = add_cp_to_invlist(invlist, - LATIN_SMALL_LETTER_DOTLESS_I); - invlist = add_cp_to_invlist(invlist, - LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); - } - else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) { - invlist = add_cp_to_invlist(invlist, 'I'); - } - else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { - invlist = add_cp_to_invlist(invlist, 'i'); - } - } - } - } - - return invlist; -} - -#undef HEADER_LENGTH -#undef TO_INTERNAL_SIZE -#undef FROM_INTERNAL_SIZE -#undef INVLIST_VERSION_ID - -/* End of inversion list object */ STATIC void S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) @@ -11451,12 +2669,6 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ -#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) -#ifdef DEBUGGING -#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) -#else -#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) -#endif STATIC regnode_offset S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, @@ -11484,7 +2696,7 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, } if (sv_dat) { - num = add_data( pRExC_state, STR_WITH_LEN("S")); + num = reg_add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void_NN(sv_dat); } @@ -11708,7 +2920,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) vFAIL("Too many nested open parens"); } - *flagp = 0; /* Initialize. */ + *flagp = 0; /* Initialize. */ /* Having this true makes it feasible to have a lot fewer tests for the * parse pointer being in scope. For example, we can write @@ -12034,7 +3246,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) RExC_seen |= REG_VERBARG_SEEN; if (start_arg) { SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); - ARG(REGNODE_p(ret)) = add_data( pRExC_state, + ARG(REGNODE_p(ret)) = reg_add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv; FLAGS(REGNODE_p(ret)) = 1; @@ -12066,10 +3278,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) if (RExC_parse > RExC_end) { paren = '\0'; } - ret = 0; /* For look-ahead/behind. */ + ret = 0; /* For look-ahead/behind. */ switch (paren) { - case 'P': /* (?P...) variants for those used to PCRE/Python */ + case 'P': /* (?P...) variants for those used to PCRE/Python */ paren = *RExC_parse; if ( paren == '<') { /* (?P<...>) named capture */ RExC_parse_inc_by(1); @@ -12412,13 +3624,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) RExC_parse_set(RExC_start + cb->end); o = cb->block; if (cb->src_regex) { - n = add_data(pRExC_state, STR_WITH_LEN("rl")); + n = reg_add_data(pRExC_state, STR_WITH_LEN("rl")); RExC_rxi->data->data[n] = (void*)SvREFCNT_inc((SV*)cb->src_regex); RExC_rxi->data->data[n+1] = (void*)o; } else { - n = add_data(pRExC_state, + n = reg_add_data(pRExC_state, (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); RExC_rxi->data->data[n] = (void*)o; } @@ -12511,7 +3723,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) } RExC_parse_inc_by(1); if (sv_dat) { - num = add_data( pRExC_state, STR_WITH_LEN("S")); + num = reg_add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void_NN(sv_dat); } @@ -12783,12 +3995,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) else if (paren == ':') { *flagp |= flags&SIMPLE; } - if (is_open) { /* Starts with OPEN. */ + if (is_open) { /* Starts with OPEN. */ if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */ REQUIRE_BRANCHJ(flagp, 0); } } - else if (paren != '?') /* Not Conditional */ + else if (paren != '?') /* Not Conditional */ ret = br; *flagp |= flags & (HASWIDTH | POSTPONED); lastbr = br; @@ -13005,7 +4217,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) vFAIL("Unmatched )"); } else - FAIL("Junk on end of regexp"); /* "Can't happen". */ + FAIL("Junk on end of regexp"); /* "Can't happen". */ NOT_REACHED; /* NOTREACHED */ } @@ -13052,7 +4264,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) } } - *flagp = 0; /* Initialize. */ + *flagp = 0; /* Initialize. */ skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE /* Don't force to /x */ ); @@ -13082,7 +4294,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) chain = latest; c++; } - if (chain == 0) { /* Loop ran zero times. */ + if (chain == 0) { /* Loop ran zero times. */ chain = reg_node(pRExC_state, NOTHING); if (ret == 0) ret = chain; @@ -14004,14 +5216,6 @@ S_backref_value(char *p, char *e) return I32_MAX; } -#ifdef DEBUGGING -#define REGNODE_GUTS(state,op,extra_size) \ - regnode_guts_debug(state,op,extra_size) -#else -#define REGNODE_GUTS(state,op,extra_size) \ - regnode_guts(state,extra_size) -#endif - /* - regatom - the lowest level @@ -14093,7 +5297,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) DECLARE_AND_GET_RE_DEBUG_FLAGS; - *flagp = 0; /* Initialize. */ + *flagp = 0; /* Initialize. */ DEBUG_PARSE("atom"); @@ -14249,7 +5453,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) else { ret = reg_node(pRExC_state, SEOL); } - RExC_seen_zerolen++; /* Do not optimize RE away */ + RExC_seen_zerolen++; /* Do not optimize RE away */ goto finish_meta_pat; case 'z': if (RExC_pm_flags & PMf_WILDCARD) { @@ -14259,7 +5463,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) else { ret = reg_node(pRExC_state, EOS); } - RExC_seen_zerolen++; /* Do not optimize RE away */ + RExC_seen_zerolen++; /* Do not optimize RE away */ goto finish_meta_pat; case 'C': vFAIL("\\C no longer supported"); @@ -16046,8 +7250,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } -STATIC void -S_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) +void +Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) { /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It * sets up the bitmap and any flags, removing those code points from the @@ -16962,7 +8166,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, /* Handle the (?[...]) construct to do set operations */ U8 curchar; /* Current character being parsed */ - UV start, end; /* End points of code point ranges */ + UV start, end; /* End points of code point ranges */ SV* final = NULL; /* The end result inversion list */ SV* result_string; /* 'final' stringified */ AV* stack; /* stack of operators and operands not yet @@ -17698,8 +8902,8 @@ S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state, #undef IS_OPERATOR #undef IS_OPERAND -STATIC void -S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) +void +Perl_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) { /* This adds the Latin1/above-Latin1 folding rules. * @@ -18073,7 +9277,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, assert(RExC_parse <= RExC_end); - if (UCHARAT(RExC_parse) == '^') { /* Complement the class */ + if (UCHARAT(RExC_parse) == '^') { /* Complement the class */ RExC_parse_inc_by(1); invert = TRUE; allow_mutiple_chars = FALSE; @@ -18236,16 +9440,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, U32 packed_warn; U8 grok_c_char; - case 'w': namedclass = ANYOF_WORDCHAR; break; - case 'W': namedclass = ANYOF_NWORDCHAR; break; - case 's': namedclass = ANYOF_SPACE; break; - case 'S': namedclass = ANYOF_NSPACE; break; - case 'd': namedclass = ANYOF_DIGIT; break; - case 'D': namedclass = ANYOF_NDIGIT; break; - case 'v': namedclass = ANYOF_VERTWS; break; - case 'V': namedclass = ANYOF_NVERTWS; break; - case 'h': namedclass = ANYOF_HORIZWS; break; - case 'H': namedclass = ANYOF_NHORIZWS; break; + case 'w': namedclass = ANYOF_WORDCHAR; break; + case 'W': namedclass = ANYOF_NWORDCHAR; break; + case 's': namedclass = ANYOF_SPACE; break; + case 'S': namedclass = ANYOF_NSPACE; break; + case 'd': namedclass = ANYOF_DIGIT; break; + case 'D': namedclass = ANYOF_NDIGIT; break; + case 'v': namedclass = ANYOF_VERTWS; break; + case 'V': namedclass = ANYOF_NVERTWS; break; + case 'h': namedclass = ANYOF_HORIZWS; break; + case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { const char * const backslash_N_beg = RExC_parse - 2; @@ -18544,15 +9748,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, named */ } break; - case 'n': value = '\n'; break; - case 'r': value = '\r'; break; - case 't': value = '\t'; break; - case 'f': value = '\f'; break; - case 'b': value = '\b'; break; - case 'e': value = ESC_NATIVE; break; - case 'a': value = '\a'; break; + case 'n': value = '\n'; break; + case 'r': value = '\r'; break; + case 't': value = '\t'; break; + case 'f': value = '\f'; break; + case 'b': value = '\b'; break; + case 'e': value = ESC_NATIVE; break; + case 'a': value = '\a'; break; case 'o': - RExC_parse--; /* function expects to be pointed at the 'o' */ + RExC_parse--; /* function expects to be pointed at the 'o' */ if (! grok_bslash_o(&RExC_parse, RExC_end, &value, @@ -18574,7 +9778,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } break; case 'x': - RExC_parse--; /* function expects to be pointed at the 'x' */ + RExC_parse--; /* function expects to be pointed at the 'x' */ if (! grok_bslash_x(&RExC_parse, RExC_end, &value, @@ -18886,8 +10090,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, cp_list = add_cp_to_invlist(cp_list, '-'); element_count++; } else - range = 1; /* yeah, it's a range! */ - continue; /* but do it the next time */ + range = 1; /* yeah, it's a range! */ + continue; /* but do it the next time */ } } } @@ -19277,7 +10481,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * ones already on the list */ if (cp_foldable_list) { if (FOLD) { - UV start, end; /* End points of code point ranges */ + UV start, end; /* End points of code point ranges */ SV* fold_intersection = NULL; SV** use_list; @@ -20711,8 +11915,8 @@ S_optimize_regclass(pTHX_ #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION -STATIC void -S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, +void +Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, regnode* const node, SV* const cp_list, SV* const runtime_defns, @@ -20724,7 +11928,7 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, * 1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE * 2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE * - * Otherwise, it sets the argument to the count returned by add_data(), + * Otherwise, it sets the argument to the count returned by reg_add_data(), * having allocated and stored an array, av, as follows: * av[0] stores the inversion list defining this class as far as known at * this time, or PL_sv_undef if nothing definite is now known. @@ -20863,7 +12067,7 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, } rv = newRV_noinc(MUTABLE_SV(av)); - n = add_data(pRExC_state, STR_WITH_LEN("s")); + n = reg_add_data(pRExC_state, STR_WITH_LEN("s")); RExC_rxi->data->data[n] = (void*)rv; ARG_SET(node, n); } @@ -21437,7 +12641,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, StructCopy(--src, --dst, regnode); } - place = REGNODE_p(operand); /* Op node, where operand used to be. */ + place = REGNODE_p(operand); /* Op node, where operand used to be. */ src = place + 1; /* NOT REGNODE_AFTER! */ FLAGS(place) = 0; FILL_NODE(operand, op); @@ -21548,7 +12752,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, regnode * const temp = regnext(REGNODE_p(scan)); #ifdef EXPERIMENTAL_INPLACESCAN if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) { - bool unfolded_multi_char; /* Unexamined in this routine */ + bool unfolded_multi_char; /* Unexamined in this routine */ if (join_exact(pRExC_state, scan, &min, &unfolded_multi_char, 1, REGNODE_p(val), depth+1)) return TRUE; /* Was return EXACT */ @@ -21606,8 +12810,8 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, } #endif -STATIC SV* -S_get_ANYOFM_contents(pTHX_ const regnode * n) { +SV* +Perl_get_ANYOFM_contents(pTHX_ const regnode * n) { /* Returns an inversion list of all the code points matched by the * ANYOFM/NANYOFM node 'n' */ @@ -21639,8 +12843,8 @@ S_get_ANYOFM_contents(pTHX_ const regnode * n) { return cp_list; } -STATIC SV * -S_get_ANYOFHbbm_contents(pTHX_ const regnode * n) { +SV * +Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n) { PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS; SV * cp_list = NULL; @@ -21655,716 +12859,11 @@ S_get_ANYOFHbbm_contents(pTHX_ const regnode * n) { return cp_list; } -/* - - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form - */ -#ifdef DEBUGGING - -static void -S_regdump_intflags(pTHX_ const char *lead, const U32 flags) -{ - int bit; - int set=0; - - ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); - - for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) { - if (flags & (1<<bit)) { - if (!set++ && lead) - Perl_re_printf( aTHX_ "%s", lead); - Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]); - } - } - if (lead) { - if (set) - Perl_re_printf( aTHX_ "\n"); - else - Perl_re_printf( aTHX_ "%s[none-set]\n", lead); - } -} - -static void -S_regdump_extflags(pTHX_ const char *lead, const U32 flags) -{ - int bit; - int set=0; - regex_charset cs; - - ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8); - - for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) { - if (flags & (1U<<bit)) { - if ((1U<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */ - continue; - } - if (!set++ && lead) - Perl_re_printf( aTHX_ "%s", lead); - Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]); - } - } - if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) { - if (!set++ && lead) { - Perl_re_printf( aTHX_ "%s", lead); - } - switch (cs) { - case REGEX_UNICODE_CHARSET: - Perl_re_printf( aTHX_ "UNICODE"); - break; - case REGEX_LOCALE_CHARSET: - Perl_re_printf( aTHX_ "LOCALE"); - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - Perl_re_printf( aTHX_ "ASCII-RESTRICTED"); - break; - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED"); - break; - default: - Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET"); - break; - } - } - if (lead) { - if (set) - Perl_re_printf( aTHX_ "\n"); - else - Perl_re_printf( aTHX_ "%s[none-set]\n", lead); - } -} -#endif - -void -Perl_regdump(pTHX_ const regexp *r) -{ -#ifdef DEBUGGING - int i; - SV * const sv = sv_newmortal(); - SV *dsv= sv_newmortal(); - RXi_GET_DECL(r, ri); - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_REGDUMP; - - (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0); - - /* Header fields of interest. */ - for (i = 0; i < 2; i++) { - if (r->substrs->data[i].substr) { - RE_PV_QUOTED_DECL(s, 0, dsv, - SvPVX_const(r->substrs->data[i].substr), - RE_SV_DUMPLEN(r->substrs->data[i].substr), - PL_dump_re_max_len); - Perl_re_printf( aTHX_ - "%s %s%s at %" IVdf "..%" UVuf " ", - i ? "floating" : "anchored", - s, - RE_SV_TAIL(r->substrs->data[i].substr), - (IV)r->substrs->data[i].min_offset, - (UV)r->substrs->data[i].max_offset); - } - else if (r->substrs->data[i].utf8_substr) { - RE_PV_QUOTED_DECL(s, 1, dsv, - SvPVX_const(r->substrs->data[i].utf8_substr), - RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr), - 30); - Perl_re_printf( aTHX_ - "%s utf8 %s%s at %" IVdf "..%" UVuf " ", - i ? "floating" : "anchored", - s, - RE_SV_TAIL(r->substrs->data[i].utf8_substr), - (IV)r->substrs->data[i].min_offset, - (UV)r->substrs->data[i].max_offset); - } - } - - if (r->check_substr || r->check_utf8) - Perl_re_printf( aTHX_ - (const char *) - ( r->check_substr == r->substrs->data[1].substr - && r->check_utf8 == r->substrs->data[1].utf8_substr - ? "(checking floating" : "(checking anchored")); - if (r->intflags & PREGf_NOSCAN) - Perl_re_printf( aTHX_ " noscan"); - if (r->extflags & RXf_CHECK_ALL) - Perl_re_printf( aTHX_ " isall"); - if (r->check_substr || r->check_utf8) - Perl_re_printf( aTHX_ ") "); - - if (ri->regstclass) { - regprop(r, sv, ri->regstclass, NULL, NULL); - Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv)); - } - if (r->intflags & PREGf_ANCH) { - Perl_re_printf( aTHX_ "anchored"); - if (r->intflags & PREGf_ANCH_MBOL) - Perl_re_printf( aTHX_ "(MBOL)"); - if (r->intflags & PREGf_ANCH_SBOL) - Perl_re_printf( aTHX_ "(SBOL)"); - if (r->intflags & PREGf_ANCH_GPOS) - Perl_re_printf( aTHX_ "(GPOS)"); - Perl_re_printf( aTHX_ " "); - } - if (r->intflags & PREGf_GPOS_SEEN) - Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs); - if (r->intflags & PREGf_SKIP) - Perl_re_printf( aTHX_ "plus "); - if (r->intflags & PREGf_IMPLICIT) - Perl_re_printf( aTHX_ "implicit "); - Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen); - if (r->extflags & RXf_EVAL_SEEN) - Perl_re_printf( aTHX_ "with eval "); - Perl_re_printf( aTHX_ "\n"); - DEBUG_FLAGS_r({ - regdump_extflags("r->extflags: ", r->extflags); - regdump_intflags("r->intflags: ", r->intflags); - }); -#else - PERL_ARGS_ASSERT_REGDUMP; - PERL_UNUSED_CONTEXT; - PERL_UNUSED_ARG(r); -#endif /* DEBUGGING */ -} - -/* Should be synchronized with ANYOF_ #defines in regcomp.h */ -#ifdef DEBUGGING - -# if CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1 || CC_ALPHA_ != 2 \ - || CC_LOWER_ != 3 || CC_UPPER_ != 4 || CC_PUNCT_ != 5 \ - || CC_PRINT_ != 6 || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8 \ - || CC_CASED_ != 9 || CC_SPACE_ != 10 || CC_BLANK_ != 11 \ - || CC_XDIGIT_ != 12 || CC_CNTRL_ != 13 || CC_ASCII_ != 14 \ - || CC_VERTSPACE_ != 15 -# error Need to adjust order of anyofs[] -# endif -static const char * const anyofs[] = { - "\\w", - "\\W", - "\\d", - "\\D", - "[:alpha:]", - "[:^alpha:]", - "[:lower:]", - "[:^lower:]", - "[:upper:]", - "[:^upper:]", - "[:punct:]", - "[:^punct:]", - "[:print:]", - "[:^print:]", - "[:alnum:]", - "[:^alnum:]", - "[:graph:]", - "[:^graph:]", - "[:cased:]", - "[:^cased:]", - "\\s", - "\\S", - "[:blank:]", - "[:^blank:]", - "[:xdigit:]", - "[:^xdigit:]", - "[:cntrl:]", - "[:^cntrl:]", - "[:ascii:]", - "[:^ascii:]", - "\\v", - "\\V" -}; -#endif - -/* -- regprop - printable representation of opcode, with run time support -*/ - -void -Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state) -{ -#ifdef DEBUGGING - U8 k; - const U8 op = OP(o); - RXi_GET_DECL(prog, progi); - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_REGPROP; - - SvPVCLEAR(sv); - - if (op > REGNODE_MAX) { /* regnode.type is unsigned */ - if (pRExC_state) { /* This gives more info, if we have it */ - FAIL3("panic: corrupted regexp opcode %d > %d", - (int)op, (int)REGNODE_MAX); - } - else { - Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d", - (int)op, (int)REGNODE_MAX); - } - } - sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */ - - k = REGNODE_TYPE(op); - - if (k == EXACT) { - sv_catpvs(sv, " "); - /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) - * is a crude hack but it may be the best for now since - * we have no flag "this EXACTish node was UTF-8" - * --jhi */ - pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len, - PL_colors[0], PL_colors[1], - PERL_PV_ESCAPE_UNI_DETECT | - PERL_PV_ESCAPE_NONASCII | - PERL_PV_PRETTY_ELLIPSES | - PERL_PV_PRETTY_LTGT | - PERL_PV_PRETTY_NOCLEAR - ); - } else if (k == TRIE) { - /* print the details of the trie in dumpuntil instead, as - * progi->data isn't available here */ - const U32 n = ARG(o); - const reg_ac_data * const ac = IS_TRIE_AC(op) ? - (reg_ac_data *)progi->data->data[n] : - NULL; - const reg_trie_data * const trie - = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; - - Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(o->flags)); - DEBUG_TRIE_COMPILE_r({ - if (trie->jump) - sv_catpvs(sv, "(JUMP)"); - Perl_sv_catpvf(aTHX_ sv, - "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">", - (UV)trie->startstate, - (IV)trie->statecount-1, /* -1 because of the unused 0 element */ - (UV)trie->wordcount, - (UV)trie->minlen, - (UV)trie->maxlen, - (UV)TRIE_CHARCOUNT(trie), - (UV)trie->uniquecharcount - ); - }); - if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { - sv_catpvs(sv, "["); - (void) put_charclass_bitmap_innards(sv, - ((IS_ANYOF_TRIE(op)) - ? ANYOF_BITMAP(o) - : TRIE_BITMAP(trie)), - NULL, - NULL, - NULL, - 0, - FALSE - ); - sv_catpvs(sv, "]"); - } - } else if (k == CURLY) { - U32 lo = ARG1(o), hi = ARG2(o); - if (op == CURLYM || op == CURLYN || op == CURLYX) - Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ - Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo); - if (hi == REG_INFTY) - sv_catpvs(sv, "INFTY"); - else - Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi); - sv_catpvs(sv, "}"); - } - else if (k == WHILEM && o->flags) /* Ordinal/of */ - Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); - else if (k == REF || k == OPEN || k == CLOSE - || k == GROUPP || op == ACCEPT) - { - AV *name_list= NULL; - U32 parno= (op == ACCEPT) ? (U32)ARG2L(o) : - (op == OPEN || op == CLOSE) ? (U32)PARNO(o) : - (U32)ARG(o); - Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */ - if ( RXp_PAREN_NAMES(prog) ) { - name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); - } else if ( pRExC_state ) { - name_list= RExC_paren_name_list; - } - if ( name_list ) { - if ( k != REF || (op < REFN)) { - SV **name= av_fetch_simple(name_list, parno, 0 ); - if (name) - Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); - } - else - if (parno > 0) { - /* parno must always be larger than 0 for this block - * as it represents a slot into the data array, which - * has the 0 slot reserved for a placeholder so any valid - * index into it is always true, eg non-zero - * see the '%' "what" type and the implementation of - * S_add_data() - */ - SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]); - I32 *nums=(I32*)SvPVX(sv_dat); - SV **name= av_fetch_simple(name_list, nums[0], 0 ); - I32 n; - if (name) { - for ( n=0; n<SvIVX(sv_dat); n++ ) { - Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf, - (n ? "," : ""), (IV)nums[n]); - } - Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); - } - } - } - if ( k == REF && reginfo) { - U32 n = ARG(o); /* which paren pair */ - I32 ln = prog->offs[n].start; - if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1) - Perl_sv_catpvf(aTHX_ sv, ": FAIL"); - else if (ln == prog->offs[n].end) - Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); - else { - const char *s = reginfo->strbeg + ln; - Perl_sv_catpvf(aTHX_ sv, ": "); - Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, - PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); - } - } - } else if (k == GOSUB) { - AV *name_list= NULL; - if ( RXp_PAREN_NAMES(prog) ) { - name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); - } else if ( pRExC_state ) { - name_list= RExC_paren_name_list; - } - - /* Paren and offset */ - Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o), - (int)((o + (int)ARG2L(o)) - progi->program) ); - if (name_list) { - SV **name= av_fetch_simple(name_list, ARG(o), 0 ); - if (name) - Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); - } - } - else if (k == LOGICAL) - /* 2: embedded, otherwise 1 */ - Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); - else if (k == ANYOF || k == ANYOFH || k == ANYOFR) { - U8 flags; - char * bitmap; - U8 do_sep = 0; /* Do we need to separate various components of the - output? */ - /* Set if there is still an unresolved user-defined property */ - SV *unresolved = NULL; - - /* Things that are ignored except when the runtime locale is UTF-8 */ - SV *only_utf8_locale_invlist = NULL; - - /* Code points that don't fit in the bitmap */ - SV *nonbitmap_invlist = NULL; - - /* And things that aren't in the bitmap, but are small enough to be */ - SV* bitmap_range_not_in_bitmap = NULL; - - bool inverted; - - if (k != ANYOF) { - flags = 0; - bitmap = NULL; - } - else { - flags = ANYOF_FLAGS(o); - bitmap = ANYOF_BITMAP(o); - } - - if (op == ANYOFL || op == ANYOFPOSIXL) { - if ((flags & ANYOFL_UTF8_LOCALE_REQD)) { - sv_catpvs(sv, "{utf8-locale-reqd}"); - } - if (flags & ANYOFL_FOLD) { - sv_catpvs(sv, "{i}"); - } - } - - inverted = flags & ANYOF_INVERT; - - /* If there is stuff outside the bitmap, get it */ - if (k == ANYOFR) { - - /* For a single range, split into the parts inside vs outside the - * bitmap. */ - UV start = ANYOFRbase(o); - UV end = ANYOFRbase(o) + ANYOFRdelta(o); - - if (start < NUM_ANYOF_CODE_POINTS) { - if (end < NUM_ANYOF_CODE_POINTS) { - bitmap_range_not_in_bitmap - = _add_range_to_invlist(bitmap_range_not_in_bitmap, - start, end); - } - else { - bitmap_range_not_in_bitmap - = _add_range_to_invlist(bitmap_range_not_in_bitmap, - start, NUM_ANYOF_CODE_POINTS); - start = NUM_ANYOF_CODE_POINTS; - } - } - - if (start >= NUM_ANYOF_CODE_POINTS) { - nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, - ANYOFRbase(o), - ANYOFRbase(o) + ANYOFRdelta(o)); - } - } - else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) { - nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, - NUM_ANYOF_CODE_POINTS, - UV_MAX); - } - else if (ANYOF_HAS_AUX(o)) { - (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE, - &unresolved, - &only_utf8_locale_invlist, - &nonbitmap_invlist); - - /* The aux data may contain stuff that could fit in the bitmap. - * This could come from a user-defined property being finally - * resolved when this call was done; or much more likely because - * there are matches that require UTF-8 to be valid, and so aren't - * in the bitmap (or ANYOFR). This is teased apart later */ - _invlist_intersection(nonbitmap_invlist, - PL_InBitmap, - &bitmap_range_not_in_bitmap); - /* Leave just the things that don't fit into the bitmap */ - _invlist_subtract(nonbitmap_invlist, - PL_InBitmap, - &nonbitmap_invlist); - } - - /* Ready to start outputting. First, the initial left bracket */ - Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); - - if ( bitmap - || bitmap_range_not_in_bitmap - || only_utf8_locale_invlist - || unresolved) - { - /* Then all the things that could fit in the bitmap */ - do_sep = put_charclass_bitmap_innards( - sv, - bitmap, - bitmap_range_not_in_bitmap, - only_utf8_locale_invlist, - o, - flags, - - /* Can't try inverting for a - * better display if there - * are things that haven't - * been resolved */ - (unresolved != NULL || k == ANYOFR)); - SvREFCNT_dec(bitmap_range_not_in_bitmap); - - /* If there are user-defined properties which haven't been defined - * yet, output them. If the result is not to be inverted, it is - * clearest to output them in a separate [] from the bitmap range - * stuff. If the result is to be complemented, we have to show - * everything in one [], as the inversion applies to the whole - * thing. Use {braces} to separate them from anything in the - * bitmap and anything above the bitmap. */ - if (unresolved) { - if (inverted) { - if (! do_sep) { /* If didn't output anything in the bitmap - */ - sv_catpvs(sv, "^"); - } - sv_catpvs(sv, "{"); - } - else if (do_sep) { - Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], - PL_colors[0]); - } - sv_catsv(sv, unresolved); - if (inverted) { - sv_catpvs(sv, "}"); - } - do_sep = ! inverted; - } - else if ( do_sep == 2 - && ! nonbitmap_invlist - && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o)) - { - /* Here, the display shows the class as inverted, and - * everything above the lower display should also match, but - * there is no indication of that. Add this range so the code - * below will add it to the display */ - _invlist_union_complement_2nd(nonbitmap_invlist, - PL_InBitmap, - &nonbitmap_invlist); - } - } - - /* And, finally, add the above-the-bitmap stuff */ - if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) { - SV* contents; - - /* See if truncation size is overridden */ - const STRLEN dump_len = (PL_dump_re_max_len > 256) - ? PL_dump_re_max_len - : 256; - - /* This is output in a separate [] */ - if (do_sep) { - Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]); - } - - /* And, for easy of understanding, it is shown in the - * uncomplemented form if possible. The one exception being if - * there are unresolved items, where the inversion has to be - * delayed until runtime */ - if (inverted && ! unresolved) { - _invlist_invert(nonbitmap_invlist); - _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist); - } - - contents = invlist_contents(nonbitmap_invlist, - FALSE /* output suitable for catsv */ - ); - - /* If the output is shorter than the permissible maximum, just do it. */ - if (SvCUR(contents) <= dump_len) { - sv_catsv(sv, contents); - } - else { - const char * contents_string = SvPVX(contents); - STRLEN i = dump_len; - - /* Otherwise, start at the permissible max and work back to the - * first break possibility */ - while (i > 0 && contents_string[i] != ' ') { - i--; - } - if (i == 0) { /* Fail-safe. Use the max if we couldn't - find a legal break */ - i = dump_len; - } - - sv_catpvn(sv, contents_string, i); - sv_catpvs(sv, "..."); - } - - SvREFCNT_dec_NN(contents); - SvREFCNT_dec_NN(nonbitmap_invlist); - } - - /* And finally the matching, closing ']' */ - Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); - - if (op == ANYOFHs) { - Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1)); - } - else if (REGNODE_TYPE(op) != ANYOF) { - U8 lowest = (op != ANYOFHr) - ? FLAGS(o) - : LOWEST_ANYOF_HRx_BYTE(FLAGS(o)); - U8 highest = (op == ANYOFHr) - ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o)) - : (op == ANYOFH || op == ANYOFR) - ? 0xFF - : lowest; -#ifndef EBCDIC - if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o))) -#endif - { - Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest); - if (lowest != highest) { - Perl_sv_catpvf(aTHX_ sv, "-%02X", highest); - } - Perl_sv_catpvf(aTHX_ sv, ")"); - } - } - - SvREFCNT_dec(unresolved); - } - else if (k == ANYOFM) { - SV * cp_list = get_ANYOFM_contents(o); - - Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); - if (op == NANYOFM) { - _invlist_invert(cp_list); - } - - put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE); - Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); - - SvREFCNT_dec(cp_list); - } - else if (k == ANYOFHbbm) { - SV * cp_list = get_ANYOFHbbm_contents(o); - Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); - - sv_catsv(sv, invlist_contents(cp_list, - FALSE /* output suitable for catsv */ - )); - Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); - - SvREFCNT_dec(cp_list); - } - else if (k == POSIXD || k == NPOSIXD) { - U8 index = FLAGS(o) * 2; - if (index < C_ARRAY_LENGTH(anyofs)) { - if (*anyofs[index] != '[') { - sv_catpvs(sv, "["); - } - sv_catpv(sv, anyofs[index]); - if (*anyofs[index] != '[') { - sv_catpvs(sv, "]"); - } - } - else { - Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); - } - } - else if (k == BOUND || k == NBOUND) { - /* Must be synced with order of 'bound_type' in regcomp.h */ - const char * const bounds[] = { - "", /* Traditional */ - "{gcb}", - "{lb}", - "{sb}", - "{wb}" - }; - assert(FLAGS(o) < C_ARRAY_LENGTH(bounds)); - sv_catpv(sv, bounds[FLAGS(o)]); - } - else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) { - Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags)); - if (o->next_off) { - Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off); - } - Perl_sv_catpvf(aTHX_ sv, "]"); - } - else if (op == SBOL) - Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^"); - - /* add on the verb argument if there is one */ - if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && o->flags) { - if ( ARG(o) ) - Perl_sv_catpvf(aTHX_ sv, ":%" SVf, - SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); - else - sv_catpvs(sv, ":NULL"); - } -#else - PERL_UNUSED_CONTEXT; - PERL_UNUSED_ARG(sv); - PERL_UNUSED_ARG(o); - PERL_UNUSED_ARG(prog); - PERL_UNUSED_ARG(reginfo); - PERL_UNUSED_ARG(pRExC_state); -#endif /* DEBUGGING */ -} - SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) -{ /* Assume that RE_INTUIT is set */ +{ /* Assume that RE_INTUIT is set */ /* Returns an SV containing a string that must appear in the target for it * to match, or NULL if nothing is known that must match. * @@ -22686,7 +13185,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) } break; case '%': - /* NO-OP a '%' data contains a null pointer, so that add_data + /* NO-OP a '%' data contains a null pointer, so that reg_add_data * always returns non-zero, this should only ever happen in the * 0 index */ assert(n==0); @@ -22703,9 +13202,9 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) Safefree(ri); } -#define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t)) -#define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t)) -#define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL) +#define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t)) +#define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t)) +#define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL) /* =for apidoc re_dup_guts @@ -22921,7 +13420,7 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) break; case '%': /* this is a placeholder type, it exists purely so that - * add_data always returns a non-zero value, this type of + * reg_add_data always returns a non-zero value, this type of * entry should ONLY be present in the 0 slot of the array */ assert(i == 0); d->data[i]= ri->data->data[i]; @@ -23014,778 +13513,6 @@ Perl_save_re_context(pTHX) } #endif -#ifdef DEBUGGING - -STATIC void -S_put_code_point(pTHX_ SV *sv, UV c) -{ - PERL_ARGS_ASSERT_PUT_CODE_POINT; - - if (c > 255) { - Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c); - } - else if (isPRINT(c)) { - const char string = (char) c; - - /* We use {phrase} as metanotation in the class, so also escape literal - * braces */ - if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}') - sv_catpvs(sv, "\\"); - sv_catpvn(sv, &string, 1); - } - else if (isMNEMONIC_CNTRL(c)) { - Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c)); - } - else { - Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c); - } -} - -STATIC void -S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) -{ - /* Appends to 'sv' a displayable version of the range of code points from - * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls - * that have them, when they occur at the beginning or end of the range. - * It uses hex to output the remaining code points, unless 'allow_literals' - * is true, in which case the printable ASCII ones are output as-is (though - * some of these will be escaped by put_code_point()). - * - * NOTE: This is designed only for printing ranges of code points that fit - * inside an ANYOF bitmap. Higher code points are simply suppressed - */ - - const unsigned int min_range_count = 3; - - assert(start <= end); - - PERL_ARGS_ASSERT_PUT_RANGE; - - while (start <= end) { - UV this_end; - const char * format; - - if ( end - start < min_range_count - && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end)))) - { - /* Output a range of 1 or 2 chars individually, or longer ranges - * when printable */ - for (; start <= end; start++) { - put_code_point(sv, start); - } - break; - } - - /* If permitted by the input options, and there is a possibility that - * this range contains a printable literal, look to see if there is - * one. */ - if (allow_literals && start <= MAX_PRINT_A) { - - /* If the character at the beginning of the range isn't an ASCII - * printable, effectively split the range into two parts: - * 1) the portion before the first such printable, - * 2) the rest - * and output them separately. */ - if (! isPRINT_A(start)) { - UV temp_end = start + 1; - - /* There is no point looking beyond the final possible - * printable, in MAX_PRINT_A */ - UV max = MIN(end, MAX_PRINT_A); - - while (temp_end <= max && ! isPRINT_A(temp_end)) { - temp_end++; - } - - /* Here, temp_end points to one beyond the first printable if - * found, or to one beyond 'max' if not. If none found, make - * sure that we use the entire range */ - if (temp_end > MAX_PRINT_A) { - temp_end = end + 1; - } - - /* Output the first part of the split range: the part that - * doesn't have printables, with the parameter set to not look - * for literals (otherwise we would infinitely recurse) */ - put_range(sv, start, temp_end - 1, FALSE); - - /* The 2nd part of the range (if any) starts here. */ - start = temp_end; - - /* We do a continue, instead of dropping down, because even if - * the 2nd part is non-empty, it could be so short that we want - * to output it as individual characters, as tested for at the - * top of this loop. */ - continue; - } - - /* Here, 'start' is a printable ASCII. If it is an alphanumeric, - * output a sub-range of just the digits or letters, then process - * the remaining portion as usual. */ - if (isALPHANUMERIC_A(start)) { - UV mask = (isDIGIT_A(start)) - ? CC_DIGIT_ - : isUPPER_A(start) - ? CC_UPPER_ - : CC_LOWER_; - UV temp_end = start + 1; - - /* Find the end of the sub-range that includes just the - * characters in the same class as the first character in it */ - while (temp_end <= end && generic_isCC_A_(temp_end, mask)) { - temp_end++; - } - temp_end--; - - /* For short ranges, don't duplicate the code above to output - * them; just call recursively */ - if (temp_end - start < min_range_count) { - put_range(sv, start, temp_end, FALSE); - } - else { /* Output as a range */ - put_code_point(sv, start); - sv_catpvs(sv, "-"); - put_code_point(sv, temp_end); - } - start = temp_end + 1; - continue; - } - - /* We output any other printables as individual characters */ - if (isPUNCT_A(start) || isSPACE_A(start)) { - while (start <= end && (isPUNCT_A(start) - || isSPACE_A(start))) - { - put_code_point(sv, start); - start++; - } - continue; - } - } /* End of looking for literals */ - - /* Here is not to output as a literal. Some control characters have - * mnemonic names. Split off any of those at the beginning and end of - * the range to print mnemonically. It isn't possible for many of - * these to be in a row, so this won't overwhelm with output */ - if ( start <= end - && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end))) - { - while (isMNEMONIC_CNTRL(start) && start <= end) { - put_code_point(sv, start); - start++; - } - - /* If this didn't take care of the whole range ... */ - if (start <= end) { - - /* Look backwards from the end to find the final non-mnemonic - * */ - UV temp_end = end; - while (isMNEMONIC_CNTRL(temp_end)) { - temp_end--; - } - - /* And separately output the interior range that doesn't start - * or end with mnemonics */ - put_range(sv, start, temp_end, FALSE); - - /* Then output the mnemonic trailing controls */ - start = temp_end + 1; - while (start <= end) { - put_code_point(sv, start); - start++; - } - break; - } - } - - /* As a final resort, output the range or subrange as hex. */ - - if (start >= NUM_ANYOF_CODE_POINTS) { - this_end = end; - } - else { /* Have to split range at the bitmap boundary */ - this_end = (end < NUM_ANYOF_CODE_POINTS) - ? end - : NUM_ANYOF_CODE_POINTS - 1; - } -#if NUM_ANYOF_CODE_POINTS > 256 - format = (this_end < 256) - ? "\\x%02" UVXf "-\\x%02" UVXf - : "\\x{%04" UVXf "}-\\x{%04" UVXf "}"; -#else - format = "\\x%02" UVXf "-\\x%02" UVXf; -#endif - GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); - Perl_sv_catpvf(aTHX_ sv, format, start, this_end); - GCC_DIAG_RESTORE_STMT; - break; - } -} - -STATIC void -S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist) -{ - /* Concatenate onto the PV in 'sv' a displayable form of the inversion list - * 'invlist' */ - - UV start, end; - bool allow_literals = TRUE; - - PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST; - - /* Generally, it is more readable if printable characters are output as - * literals, but if a range (nearly) spans all of them, it's best to output - * it as a single range. This code will use a single range if all but 2 - * ASCII printables are in it */ - invlist_iterinit(invlist); - while (invlist_iternext(invlist, &start, &end)) { - - /* If the range starts beyond the final printable, it doesn't have any - * in it */ - if (start > MAX_PRINT_A) { - break; - } - - /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span - * all but two, the range must start and end no later than 2 from - * either end */ - if (start < ' ' + 2 && end > MAX_PRINT_A - 2) { - if (end > MAX_PRINT_A) { - end = MAX_PRINT_A; - } - if (start < ' ') { - start = ' '; - } - if (end - start >= MAX_PRINT_A - ' ' - 2) { - allow_literals = FALSE; - } - break; - } - } - invlist_iterfinish(invlist); - - /* Here we have figured things out. Output each range */ - invlist_iterinit(invlist); - while (invlist_iternext(invlist, &start, &end)) { - if (start >= NUM_ANYOF_CODE_POINTS) { - break; - } - put_range(sv, start, end, allow_literals); - } - invlist_iterfinish(invlist); - - return; -} - -STATIC SV* -S_put_charclass_bitmap_innards_common(pTHX_ - SV* invlist, /* The bitmap */ - SV* posixes, /* Under /l, things like [:word:], \S */ - SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */ - SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */ - SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */ - const bool invert /* Is the result to be inverted? */ -) -{ - /* Create and return an SV containing a displayable version of the bitmap - * and associated information determined by the input parameters. If the - * output would have been only the inversion indicator '^', NULL is instead - * returned. */ - - SV * output; - - PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON; - - if (invert) { - output = newSVpvs("^"); - } - else { - output = newSVpvs(""); - } - - /* First, the code points in the bitmap that are unconditionally there */ - put_charclass_bitmap_innards_invlist(output, invlist); - - /* Traditionally, these have been placed after the main code points */ - if (posixes) { - sv_catsv(output, posixes); - } - - if (only_utf8 && _invlist_len(only_utf8)) { - Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]); - put_charclass_bitmap_innards_invlist(output, only_utf8); - } - - if (not_utf8 && _invlist_len(not_utf8)) { - Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]); - put_charclass_bitmap_innards_invlist(output, not_utf8); - } - - if (only_utf8_locale && _invlist_len(only_utf8_locale)) { - Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]); - put_charclass_bitmap_innards_invlist(output, only_utf8_locale); - - /* This is the only list in this routine that can legally contain code - * points outside the bitmap range. The call just above to - * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so - * output them here. There's about a half-dozen possible, and none in - * contiguous ranges longer than 2 */ - if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { - UV start, end; - SV* above_bitmap = NULL; - - _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap); - - invlist_iterinit(above_bitmap); - while (invlist_iternext(above_bitmap, &start, &end)) { - UV i; - - for (i = start; i <= end; i++) { - put_code_point(output, i); - } - } - invlist_iterfinish(above_bitmap); - SvREFCNT_dec_NN(above_bitmap); - } - } - - if (invert && SvCUR(output) == 1) { - return NULL; - } - - return output; -} - -STATIC U8 -S_put_charclass_bitmap_innards(pTHX_ SV *sv, - char *bitmap, - SV *nonbitmap_invlist, - SV *only_utf8_locale_invlist, - const regnode * const node, - const U8 flags, - const bool force_as_is_display) -{ - /* Appends to 'sv' a displayable version of the innards of the bracketed - * character class defined by the other arguments: - * 'bitmap' points to the bitmap, or NULL if to ignore that. - * 'nonbitmap_invlist' is an inversion list of the code points that are in - * the bitmap range, but for some reason aren't in the bitmap; NULL if - * none. The reasons for this could be that they require some - * condition such as the target string being or not being in UTF-8 - * (under /d), or because they came from a user-defined property that - * was not resolved at the time of the regex compilation (under /u) - * 'only_utf8_locale_invlist' is an inversion list of the code points that - * are valid only if the runtime locale is a UTF-8 one; NULL if none - * 'node' is the regex pattern ANYOF node. It is needed only when the - * above two parameters are not null, and is passed so that this - * routine can tease apart the various reasons for them. - * 'flags' is the flags field of 'node' - * 'force_as_is_display' is TRUE if this routine should definitely NOT try - * to invert things to see if that leads to a cleaner display. If - * FALSE, this routine is free to use its judgment about doing this. - * - * It returns 0 if nothing was actually output. (It may be that - * the bitmap, etc is empty.) - * 1 if the output wasn't inverted (didn't begin with a '^') - * 2 if the output was inverted (did begin with a '^') - * - * When called for outputting the bitmap of a non-ANYOF node, just pass the - * bitmap, with the succeeding parameters set to NULL, and the final one to - * FALSE. - */ - - /* In general, it tries to display the 'cleanest' representation of the - * innards, choosing whether to display them inverted or not, regardless of - * whether the class itself is to be inverted. However, there are some - * cases where it can't try inverting, as what actually matches isn't known - * until runtime, and hence the inversion isn't either. */ - - bool inverting_allowed = ! force_as_is_display; - - int i; - STRLEN orig_sv_cur = SvCUR(sv); - - SV* invlist; /* Inversion list we accumulate of code points that - are unconditionally matched */ - SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is - UTF-8 */ - SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8 - */ - SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */ - SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale - is UTF-8 */ - - SV* as_is_display; /* The output string when we take the inputs - literally */ - SV* inverted_display; /* The output string when we invert the inputs */ - - bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted - to match? */ - /* We are biased in favor of displaying things without them being inverted, - * as that is generally easier to understand */ - const int bias = 5; - - PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS; - - /* Start off with whatever code points are passed in. (We clone, so we - * don't change the caller's list) */ - if (nonbitmap_invlist) { - assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS); - invlist = invlist_clone(nonbitmap_invlist, NULL); - } - else { /* Worst case size is every other code point is matched */ - invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2); - } - - if (flags) { - if (OP(node) == ANYOFD) { - - /* This flag indicates that the code points below 0x100 in the - * nonbitmap list are precisely the ones that match only when the - * target is UTF-8 (they should all be non-ASCII). */ - if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) { - _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8); - _invlist_subtract(invlist, only_utf8, &invlist); - } - - /* And this flag for matching all non-ASCII 0xFF and below */ - if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) { - not_utf8 = invlist_clone(PL_UpperLatin1, NULL); - } - } - else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) { - - /* If either of these flags are set, what matches isn't - * determinable except during execution, so don't know enough here - * to invert */ - if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) { - inverting_allowed = FALSE; - } - - /* What the posix classes match also varies at runtime, so these - * will be output symbolically. */ - if (ANYOF_POSIXL_TEST_ANY_SET(node)) { - int i; - - posixes = newSVpvs(""); - for (i = 0; i < ANYOF_POSIXL_MAX; i++) { - if (ANYOF_POSIXL_TEST(node, i)) { - sv_catpv(posixes, anyofs[i]); - } - } - } - } - } - - /* Accumulate the bit map into the unconditional match list */ - if (bitmap) { - for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { - if (BITMAP_TEST(bitmap, i)) { - int start = i++; - for (; - i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); - i++) - { /* empty */ } - invlist = _add_range_to_invlist(invlist, start, i-1); - } - } - } - - /* Make sure that the conditional match lists don't have anything in them - * that match unconditionally; otherwise the output is quite confusing. - * This could happen if the code that populates these misses some - * duplication. */ - if (only_utf8) { - _invlist_subtract(only_utf8, invlist, &only_utf8); - } - if (not_utf8) { - _invlist_subtract(not_utf8, invlist, ¬_utf8); - } - - if (only_utf8_locale_invlist) { - - /* Since this list is passed in, we have to make a copy before - * modifying it */ - only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL); - - _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale); - - /* And, it can get really weird for us to try outputting an inverted - * form of this list when it has things above the bitmap, so don't even - * try */ - if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { - inverting_allowed = FALSE; - } - } - - /* Calculate what the output would be if we take the input as-is */ - as_is_display = put_charclass_bitmap_innards_common(invlist, - posixes, - only_utf8, - not_utf8, - only_utf8_locale, - invert); - - /* If have to take the output as-is, just do that */ - if (! inverting_allowed) { - if (as_is_display) { - sv_catsv(sv, as_is_display); - SvREFCNT_dec_NN(as_is_display); - } - } - else { /* But otherwise, create the output again on the inverted input, and - use whichever version is shorter */ - - int inverted_bias, as_is_bias; - - /* We will apply our bias to whichever of the results doesn't have - * the '^' */ - bool trial_invert; - if (invert) { - trial_invert = FALSE; - as_is_bias = bias; - inverted_bias = 0; - } - else { - trial_invert = TRUE; - as_is_bias = 0; - inverted_bias = bias; - } - - /* Now invert each of the lists that contribute to the output, - * excluding from the result things outside the possible range */ - - /* For the unconditional inversion list, we have to add in all the - * conditional code points, so that when inverted, they will be gone - * from it */ - _invlist_union(only_utf8, invlist, &invlist); - _invlist_union(not_utf8, invlist, &invlist); - _invlist_union(only_utf8_locale, invlist, &invlist); - _invlist_invert(invlist); - _invlist_intersection(invlist, PL_InBitmap, &invlist); - - if (only_utf8) { - _invlist_invert(only_utf8); - _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8); - } - else if (not_utf8) { - - /* If a code point matches iff the target string is not in UTF-8, - * then complementing the result has it not match iff not in UTF-8, - * which is the same thing as matching iff it is UTF-8. */ - only_utf8 = not_utf8; - not_utf8 = NULL; - } - - if (only_utf8_locale) { - _invlist_invert(only_utf8_locale); - _invlist_intersection(only_utf8_locale, - PL_InBitmap, - &only_utf8_locale); - } - - inverted_display = put_charclass_bitmap_innards_common( - invlist, - posixes, - only_utf8, - not_utf8, - only_utf8_locale, trial_invert); - - /* Use the shortest representation, taking into account our bias - * against showing it inverted */ - if ( inverted_display - && ( ! as_is_display - || ( SvCUR(inverted_display) + inverted_bias - < SvCUR(as_is_display) + as_is_bias))) - { - sv_catsv(sv, inverted_display); - invert = ! invert; - } - else if (as_is_display) { - sv_catsv(sv, as_is_display); - } - - SvREFCNT_dec(as_is_display); - SvREFCNT_dec(inverted_display); - } - - SvREFCNT_dec_NN(invlist); - SvREFCNT_dec(only_utf8); - SvREFCNT_dec(not_utf8); - SvREFCNT_dec(posixes); - SvREFCNT_dec(only_utf8_locale); - - U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur); - if (did_output_something) { - /* Distinguish between non and inverted cases */ - did_output_something += invert; - } - - return did_output_something; -} - -#define CLEAR_OPTSTART \ - if (optstart) STMT_START { \ - DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \ - " (%" IVdf " nodes)\n", (IV)(node - optstart))); \ - optstart=NULL; \ - } STMT_END - -#define DUMPUNTIL(b,e) \ - CLEAR_OPTSTART; \ - node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); - -STATIC const regnode * -S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, - const regnode *last, const regnode *plast, - SV* sv, I32 indent, U32 depth) -{ - const regnode *next; - const regnode *optstart= NULL; - - RXi_GET_DECL(r, ri); - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_DUMPUNTIL; - -#ifdef DEBUG_DUMPUNTIL - Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start, - last ? last-start : 0, plast ? plast-start : 0); -#endif - - if (plast && plast < last) - last= plast; - - while (node && (!last || node < last)) { - const U8 op = OP(node); - - if (op == CLOSE || op == SRCLOSE || op == WHILEM) - indent--; - next = regnext((regnode *)node); - const regnode *after = regnode_after((regnode *)node,0); - - /* Where, what. */ - if (op == OPTIMIZED) { - if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) - optstart = node; - else - goto after_print; - } else - CLEAR_OPTSTART; - - regprop(r, sv, node, NULL, NULL); - Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start), - (int)(2*indent + 1), "", SvPVX_const(sv)); - - if (op != OPTIMIZED) { - if (next == NULL) /* Next ptr. */ - Perl_re_printf( aTHX_ " (0)"); - else if (REGNODE_TYPE(op) == BRANCH - && REGNODE_TYPE(OP(next)) != BRANCH ) - Perl_re_printf( aTHX_ " (FAIL)"); - else - Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start)); - Perl_re_printf( aTHX_ "\n"); - } - - after_print: - if (REGNODE_TYPE(op) == BRANCHJ) { - assert(next); - const regnode *nnode = (OP(next) == LONGJMP - ? regnext((regnode *)next) - : next); - if (last && nnode > last) - nnode = last; - DUMPUNTIL(after, nnode); - } - else if (REGNODE_TYPE(op) == BRANCH) { - assert(next); - DUMPUNTIL(after, next); - } - else if ( REGNODE_TYPE(op) == TRIE ) { - const regnode *this_trie = node; - const U32 n = ARG(node); - const reg_ac_data * const ac = op>=AHOCORASICK ? - (reg_ac_data *)ri->data->data[n] : - NULL; - const reg_trie_data * const trie = - (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie]; -#ifdef DEBUGGING - AV *const trie_words - = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); -#endif - const regnode *nextbranch= NULL; - I32 word_idx; - SvPVCLEAR(sv); - for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { - SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0); - - Perl_re_indentf( aTHX_ "%s ", - indent+3, - elem_ptr - ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), - SvCUR(*elem_ptr), PL_dump_re_max_len, - PL_colors[0], PL_colors[1], - (SvUTF8(*elem_ptr) - ? PERL_PV_ESCAPE_UNI - : 0) - | PERL_PV_PRETTY_ELLIPSES - | PERL_PV_PRETTY_LTGT - ) - : "???" - ); - if (trie->jump) { - U16 dist= trie->jump[word_idx+1]; - Perl_re_printf( aTHX_ "(%" UVuf ")\n", - (UV)((dist ? this_trie + dist : next) - start)); - if (dist) { - if (!nextbranch) - nextbranch= this_trie + trie->jump[0]; - DUMPUNTIL(this_trie + dist, nextbranch); - } - if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH) - nextbranch= regnext((regnode *)nextbranch); - } else { - Perl_re_printf( aTHX_ "\n"); - } - } - if (last && next > last) - node= last; - else - node= next; - } - else if ( op == CURLY ) { /* "next" might be very big: optimizer */ - DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */ - } - else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) { - assert(next); - DUMPUNTIL(after, next); - } - else if ( op == PLUS || op == STAR) { - DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */ - } - else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) { - /* Literal string, where present. */ - node = (const regnode *)REGNODE_AFTER_varies(node); - } - else { - node = REGNODE_AFTER_opcode(node,op); - } - if (op == CURLYX || op == OPEN || op == SROPEN) - indent++; - if (REGNODE_TYPE(op) == END) - break; - } - CLEAR_OPTSTART; -#ifdef DEBUG_DUMPUNTIL - Perl_re_printf( aTHX_ "--- %d\n", (int)indent); -#endif - return node; -} - -#endif /* DEBUGGING */ - #ifndef PERL_IN_XSUB_RE # include "uni_keywords.h" @@ -23939,7 +13666,7 @@ warning when none was present before might cause breakage, for little gain. So khw left this code in, but not enabled. Tests were never added. embed.fnc entry: -Ei |const char *|get_extended_utf8_msg|const UV cp +Ei |const char *|get_extended_utf8_msg|const UV cp PERL_STATIC_INLINE const char * S_get_extended_utf8_msg(pTHX_ const UV cp) |