diff options
-rw-r--r-- | MANIFEST | 5 | ||||
-rwxr-xr-x | Makefile.SH | 29 | ||||
-rw-r--r-- | Makefile.micro | 17 | ||||
-rw-r--r-- | embed.fnc | 267 | ||||
-rw-r--r-- | embed.h | 212 | ||||
-rw-r--r-- | ext/re/Makefile.PL | 69 | ||||
-rw-r--r-- | handy.h | 2 | ||||
-rw-r--r-- | inline.h | 2 | ||||
-rw-r--r-- | invlist_inline.h | 100 | ||||
-rw-r--r-- | perl.h | 4 | ||||
-rw-r--r-- | proto.h | 675 | ||||
-rw-r--r-- | regcomp.c | 10477 | ||||
-rw-r--r-- | regcomp.h | 192 | ||||
-rw-r--r-- | regcomp.sym | 10 | ||||
-rw-r--r-- | regcomp_debug.c | 1625 | ||||
-rw-r--r-- | regcomp_internal.h | 1196 | ||||
-rw-r--r-- | regcomp_invlist.c | 1540 | ||||
-rw-r--r-- | regcomp_study.c | 3808 | ||||
-rw-r--r-- | regcomp_trie.c | 1688 | ||||
-rw-r--r-- | regen/unicode_constants.pl | 4 | ||||
-rw-r--r-- | regexec.c | 570 | ||||
-rw-r--r-- | regexp.h | 154 | ||||
-rw-r--r-- | unicode_constants.h | 8 | ||||
-rw-r--r-- | vms/descrip_mms.template | 24 | ||||
-rw-r--r-- | win32/GNUmakefile | 16 | ||||
-rw-r--r-- | win32/Makefile | 15 |
26 files changed, 11496 insertions, 11213 deletions
@@ -5513,6 +5513,11 @@ regcharclass.h Generated by regen/regcharclass.pl regcomp.c Regular expression compiler regcomp.h Private declarations for above regcomp.sym Data for regnodes.h +regcomp_debug.c Regular expression compiler debug code +regcomp_internal.h Internal stuff for regex compiler +regcomp_invlist.c Invlist logic for regular expresion engine +regcomp_study.c Optimizer for regular expresion compiler +regcomp_trie.c Trie logic for regular expresion compiler regen.pl Run all scripts that (re)generate files regen/charset_translations.pl Character set utilities regen/ebcdic.pl Generates ebcdic_tables.h diff --git a/Makefile.SH b/Makefile.SH index 20f6efed4e..6b9f48f463 100755 --- a/Makefile.SH +++ b/Makefile.SH @@ -530,17 +530,19 @@ unidatadirs = lib/unicore/To lib/unicore/lib h1 = EXTERN.h INTERN.h XSUB.h av.h $(CONFIGH) cop.h cv.h dosish.h h2 = embed.h form.h gv.h handy.h hv.h hv_func.h keywords.h mg.h op.h opcode.h -h3 = pad.h patchlevel.h perl.h perlapi.h perly.h pp.h proto.h regcomp.h +h3 = pad.h patchlevel.h perl.h perlapi.h perly.h pp.h proto.h regcomp.h regcomp_internal.h h4 = regexp.h scope.h sv.h unixish.h util.h iperlsys.h thread.h h5 = utf8.h warnings.h mydtrace.h op_reg_common.h l1_char_class_tab.h h6 = charclass_invlists.h h = $(h1) $(h2) $(h3) $(h4) $(h5) $(h6) c1 = av.c scope.c op.c peep.c doop.c doio.c dump.c gv.c hv.c mg.c reentr.c mro_core.c perl.c -c2 = perly.c pp.c pp_hot.c pp_ctl.c pp_sys.c regcomp.c regexec.c utf8.c sv.c +c2 = perly.c pp.c pp_hot.c pp_ctl.c pp_sys.c utf8.c sv.c c3 = taint.c toke.c util.c deb.c run.c builtin.c universal.c pad.c globals.c keywords.c c4 = perlio.c numeric.c mathoms.c locale.c pp_pack.c pp_sort.c caretx.c dquote.c time64.c -c5 = $(mallocsrc) +c5 = regcomp.c regcomp_debug.c regcomp_invlist.c regcomp_study.c regcomp_trie.c regexec.c +c6 = $(mallocsrc) +c_base = $(c1) $(c2) $(c3) $(c4) $(c5) $(c6) !NO!SUBS! @@ -552,17 +554,24 @@ main_only_objs =$main_only_objs $spitshell >>$Makefile <<'!NO!SUBS!' -c = $(c1) $(c2) $(c3) $(c4) $(c5) miniperlmain.c $(mini_only_src) +c = $(c_base) miniperlmain.c $(mini_only_src) -obj1 = $(mallocobj) gv$(OBJ_EXT) toke$(OBJ_EXT) perly$(OBJ_EXT) pad$(OBJ_EXT) regcomp$(OBJ_EXT) dump$(OBJ_EXT) util$(OBJ_EXT) mg$(OBJ_EXT) reentr$(OBJ_EXT) mro_core$(OBJ_EXT) keywords$(OBJ_EXT) builtin$(OBJ_EXT) -obj2 = hv$(OBJ_EXT) av$(OBJ_EXT) run$(OBJ_EXT) pp_hot$(OBJ_EXT) sv$(OBJ_EXT) pp$(OBJ_EXT) scope$(OBJ_EXT) pp_ctl$(OBJ_EXT) pp_sys$(OBJ_EXT) peep$(OBJ_EXT) -obj3 = doop$(OBJ_EXT) doio$(OBJ_EXT) regexec$(OBJ_EXT) utf8$(OBJ_EXT) taint$(OBJ_EXT) deb$(OBJ_EXT) globals$(OBJ_EXT) perlio$(OBJ_EXT) numeric$(OBJ_EXT) mathoms$(OBJ_EXT) locale$(OBJ_EXT) pp_pack$(OBJ_EXT) pp_sort$(OBJ_EXT) caretx$(OBJ_EXT) dquote$(OBJ_EXT) time64$(OBJ_EXT) +obj1 = $(mallocobj) gv$(OBJ_EXT) toke$(OBJ_EXT) perly$(OBJ_EXT) pad$(OBJ_EXT) +obj2 = regcomp$(OBJ_EXT) regcomp_debug$(OBJ_EXT) regcomp_invlist$(OBJ_EXT) regcomp_study$(OBJ_EXT) regcomp_trie$(OBJ_EXT) +obj3 = regexec$(OBJ_EXT) dump$(OBJ_EXT) util$(OBJ_EXT) mg$(OBJ_EXT) reentr$(OBJ_EXT) mro_core$(OBJ_EXT) +obj4 = keywords$(OBJ_EXT) builtin$(OBJ_EXT) +obj5 = hv$(OBJ_EXT) av$(OBJ_EXT) run$(OBJ_EXT) pp_hot$(OBJ_EXT) sv$(OBJ_EXT) pp$(OBJ_EXT) +obj6 = scope$(OBJ_EXT) pp_ctl$(OBJ_EXT) pp_sys$(OBJ_EXT) peep$(OBJ_EXT) +obj7 = doop$(OBJ_EXT) doio$(OBJ_EXT) utf8$(OBJ_EXT) taint$(OBJ_EXT) +obj8 = deb$(OBJ_EXT) globals$(OBJ_EXT) perlio$(OBJ_EXT) numeric$(OBJ_EXT) mathoms$(OBJ_EXT) +obj9 = locale$(OBJ_EXT) pp_pack$(OBJ_EXT) pp_sort$(OBJ_EXT) caretx$(OBJ_EXT) dquote$(OBJ_EXT) +obj10 = time64$(OBJ_EXT) # split the objects into 3 exclusive sets: those used by both miniperl and # perl, and those used by just one or the other. Doesn't include the # actual perl(mini)main.o, nor any dtrace objects. -common_objs = $(obj1) $(obj2) $(obj3) $(ARCHOBJS) +common_objs = $(obj1) $(obj2) $(obj3) $(obj4) $(obj5) $(obj6) $(obj7) $(obj8) $(obj9) $(obj10) $(ARCHOBJS) miniperl_objs_nodt = $(mini_only_objs) $(common_objs) miniperlmain$(OBJ_EXT) perllib_objs_nodt = $(main_only_objs) $(common_objs) @@ -1722,8 +1731,8 @@ distcheck: FORCE .PHONY: ctags -TAGS: $(c1) $(c2) $(c3) $(c4) $(c5) $(h) - etags $(c1) $(c2) $(c3) $(c4) $(c5) $(h) +TAGS: $(c_base) $(h) + etags $(c_base) $(h) !NO!SUBS! $spitshell >>$Makefile <<!GROK!THIS! diff --git a/Makefile.micro b/Makefile.micro index 89b52faf8c..75f9333a2a 100644 --- a/Makefile.micro +++ b/Makefile.micro @@ -18,7 +18,8 @@ O = uav$(_O) udeb$(_O) udoio$(_O) udoop$(_O) udquote$(_O) udump$(_O) \ umg$(_O) uperlmain$(_O) uop$(_O) ureentr$(_O) \ upad$(_O) uperl$(_O) uperlio$(_O) uperly$(_O) upp$(_O) \ upp_ctl$(_O) upp_hot$(_O) upp_sys$(_O) upp_pack$(_O) upp_sort$(_O) \ - uregcomp$(_O) uregexec$(_O) urun$(_O) \ + uregcomp$(_O) uregcomp_debug$(_O) uregcomp_invlist$(_O) \ + uregcomp_study$(_O) uregcomp_trie$(_O) uregexec$(_O) urun$(_O) \ uscope$(_O) usv$(_O) utaint$(_O) utime64$(_O) utoke$(_O) \ unumeric$(_O) ulocale$(_O) umathoms$(_O) \ uuniversal$(_O) uutf8$(_O) uutil$(_O) ukeywords$(_O) @@ -138,7 +139,19 @@ upp_pack$(_O): $(HE) pp_pack.c upp_sort$(_O): $(HE) pp_sort.c $(CC) $(CCFLAGS) -o $@ $(CFLAGS) pp_sort.c -uregcomp$(_O): $(HE) regcomp.c regcomp.h regnodes.h INTERN.h +uregcomp$(_O): $(HE) regcomp.c regcomp_internal.h regcomp.h regnodes.h INTERN.h + $(CC) $(CCFLAGS) -o $@ $(CFLAGS) regcomp.c + +uregcomp_debug$(_O): $(HE) regcomp_debug.c regcomp_internal.h regcomp.h regnodes.h INTERN.h + $(CC) $(CCFLAGS) -o $@ $(CFLAGS) regcomp.c + +uregcomp_invlist$(_O): $(HE) regcomp_invlist.c regcomp_internal.h regcomp.h regnodes.h INTERN.h + $(CC) $(CCFLAGS) -o $@ $(CFLAGS) regcomp.c + +uregcomp_study$(_O): $(HE) regcomp_study.c regcomp_internal.h regcomp.h regnodes.h INTERN.h + $(CC) $(CCFLAGS) -o $@ $(CFLAGS) regcomp.c + +uregcomp_trie$(_O): $(HE) regcomp_trie.c regcomp_internal.h regcomp.h regnodes.h INTERN.h $(CC) $(CCFLAGS) -o $@ $(CFLAGS) regcomp.c uregexec$(_O): $(HE) regexec.c regcomp.h regnodes.h @@ -2040,7 +2040,7 @@ Cp |I32 |regexec_flags |NN REGEXP *const rx|NN char *stringarg \ |NN char *strend|NN char *strbeg \ |SSize_t minend|NN SV *sv \ |NULLOK void *data|U32 flags -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +#if defined(PERL_IN_REGEX_ENGINE) CipR |regnode*|regnext |NULLOK const regnode* p CipR |bool|check_regnode_after |NULLOK const regnode* p|const STRLEN extra CipR |regnode*|regnode_after |NULLOK const regnode* p|bool varies @@ -2069,14 +2069,16 @@ EXpRT |I16 |do_uniprop_match|NN const char * const key|const U16 key_len EXpRT |const char * const *|get_prop_values|const int table_index EXpR |SV * |get_prop_definition|const int table_index EXpRT |const char *|get_deprecated_property_msg|const Size_t warning_offset -#if defined(PERL_IN_REGCOMP_C) +#if defined(PERL_IN_REGCOMP_ANY) EiRT |bool |invlist_is_iterating|NN const SV* const invlist EiR |SV* |invlist_contents|NN SV* const invlist \ |const bool traditional_style EixRT |UV |invlist_lowest|NN SV* const invlist +#endif +#if defined(PERL_IN_REGCOMP_ANY) EixRT |UV |invlist_highest_range_start|NN SV* const invlist -ERS |SV* |make_exactf_invlist |NN RExC_state_t *pRExC_state \ - |NN regnode *node +#endif +#if defined(PERL_IN_REGCOMP_C) ES |regnode_offset|reg_la_NOTHING |NN RExC_state_t *pRExC_state \ |U32 flags|NN const char *type ES |regnode_offset|reg_la_OPFAIL |NN RExC_state_t *pRExC_state \ @@ -2103,11 +2105,6 @@ ES |regnode_offset|regatom |NN RExC_state_t *pRExC_state \ |NN I32 *flagp|U32 depth ES |regnode_offset|regbranch |NN RExC_state_t *pRExC_state \ |NN I32 *flagp|I32 first|U32 depth -ES |void |set_ANYOF_arg |NN RExC_state_t* const pRExC_state \ - |NN regnode* const node \ - |NULLOK SV* const cp_list \ - |NULLOK SV* const runtime_defns \ - |NULLOK SV* const only_utf8_locale_list ES |void |output_posix_warnings \ |NN RExC_state_t *pRExC_state \ |NN AV* posix_warnings @@ -2162,8 +2159,6 @@ ES |bool |handle_names_wildcard \ |const STRLEN wname_len \ |NN SV ** prop_definition \ |NN AV ** strings -ES |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \ - |NN SV** invlist ES |regnode_offset|handle_named_backref|NN RExC_state_t *pRExC_state \ |NN I32 *flagp \ |NN char * backref_parse_start \ @@ -2202,25 +2197,83 @@ ESR |bool |regtail |NN RExC_state_t * pRExC_state \ |const U32 depth ES |SV * |reg_scan_name |NN RExC_state_t *pRExC_state \ |U32 flags -ES |U32 |join_exact |NN RExC_state_t *pRExC_state \ - |NN regnode *scan|NN UV *min_subtract \ - |NN bool *unfolded_multi_char \ - |U32 flags|NULLOK regnode *val|U32 depth EST |U8 |compute_EXACTish|NN RExC_state_t *pRExC_state ES |void |nextchar |NN RExC_state_t *pRExC_state ES |void |skip_to_be_ignored_text|NN RExC_state_t *pRExC_state \ |NN char ** p \ |const bool force_to_xmod EiT |char * |reg_skipcomment|NN RExC_state_t *pRExC_state|NN char * p -ES |void |scan_commit |NN const RExC_state_t *pRExC_state \ +frS |void |re_croak |bool utf8|NN const char* pat|... +ES |int |handle_possible_posix \ + |NN RExC_state_t *pRExC_state \ + |NN const char* const s \ + |NULLOK char ** updated_parse_ptr \ + |NULLOK AV** posix_warnings \ + |const bool check_only +ETSR |int |edit_distance |NN const UV *src \ + |NN const UV *tgt \ + |const STRLEN x \ + |const STRLEN y \ + |const SSize_t maxDistance +# ifdef DEBUGGING +ESR |bool |regtail_study |NN RExC_state_t *pRExC_state \ + |NN regnode_offset p|NN const regnode_offset val|U32 depth +# endif +#endif +#if defined(PERL_IN_REGCOMP_INVLIST_C) +# ifndef PERL_EXT_RE_BUILD +EiRT |UV* |_invlist_array_init |NN SV* const invlist|const bool will_have_0 +EiRT |UV |invlist_max |NN const SV* const invlist +EiRT |IV* |get_invlist_previous_index_addr|NN SV* invlist +EiT |void |invlist_set_previous_index|NN SV* const invlist|const IV index +EiRT |IV |invlist_previous_index|NN SV* const invlist +EiT |void |invlist_trim |NN SV* invlist +Ei |void |invlist_clear |NN SV* invlist +ES |void |_append_range_to_invlist |NN SV* const invlist|const UV start|const UV end +ES |void |invlist_replace_list_destroys_src|NN SV *dest|NN SV *src +S |void |initialize_invlist_guts|NN SV* invlist|const Size_t initial_size +# endif +#endif +#if defined(PERL_IN_REGCOMP_ANY) +EpR |SV * |get_ANYOFM_contents|NN const regnode * n +EpR |SV * |get_ANYOFHbbm_contents|NN const regnode * n +Ep |void |set_ANYOF_arg |NN RExC_state_t* const pRExC_state \ + |NN regnode* const node \ + |NULLOK SV* const cp_list \ + |NULLOK SV* const runtime_defns \ + |NULLOK SV* const only_utf8_locale_list +Ep |void |populate_anyof_bitmap_from_invlist|NN regnode *node|NN SV** invlist_ptr +Ep |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \ + |NN SV** invlist +Ep |SSize_t|study_chunk |NN RExC_state_t *pRExC_state \ + |NN regnode **scanp|NN SSize_t *minlenp \ + |NN SSize_t *deltap|NN regnode *last \ + |NULLOK struct scan_data_t *data \ + |I32 stopparen|U32 recursed_depth \ + |NULLOK regnode_ssc *and_withp \ + |U32 flags|U32 depth|bool was_mutate_ok +Ep |void |scan_commit |NN const RExC_state_t *pRExC_state \ |NN struct scan_data_t *data \ |NN SSize_t *minlenp \ |int is_inf -ES |void |populate_anyof_bitmap_from_invlist|NN regnode *node|NN SV** invlist_ptr +Ep |void |ssc_init |NN const RExC_state_t *pRExC_state \ + |NN regnode_ssc *ssc +ETp |bool |is_ssc_worth_it|NN const RExC_state_t * pRExC_state \ + |NN const regnode_ssc * ssc +Ep |void |ssc_finalize |NN RExC_state_t *pRExC_state \ + |NN regnode_ssc *ssc +Ep |U32 |join_exact |NN RExC_state_t *pRExC_state \ + |NN regnode *scan|NN UV *min_subtract \ + |NN bool *unfolded_multi_char \ + |U32 flags|NULLOK regnode *val|U32 depth +#endif +#if defined(PERL_IN_REGCOMP_STUDY_C) +ES |void |unwind_scan_frames|NN const void *p +ES |void |rck_elide_nothing|NN regnode *node +ERS |SV* |make_exactf_invlist |NN RExC_state_t *pRExC_state \ + |NN regnode *node ES |void |ssc_anything |NN regnode_ssc *ssc ESRT |int |ssc_is_anything|NN const regnode_ssc *ssc -ES |void |ssc_init |NN const RExC_state_t *pRExC_state \ - |NN regnode_ssc *ssc ESRT |int |ssc_is_cp_posixl_init|NN const RExC_state_t *pRExC_state \ |NN const regnode_ssc *ssc ES |void |ssc_and |NN const RExC_state_t *pRExC_state \ @@ -2241,111 +2294,30 @@ ES |void |ssc_add_range |NN regnode_ssc *ssc \ ES |void |ssc_cp_and |NN regnode_ssc *ssc \ |UV const cp EST |void |ssc_clear_locale|NN regnode_ssc *ssc -ETS |bool |is_ssc_worth_it|NN const RExC_state_t * pRExC_state \ - |NN const regnode_ssc * ssc -ES |void |ssc_finalize |NN RExC_state_t *pRExC_state \ - |NN regnode_ssc *ssc -ES |SSize_t|study_chunk |NN RExC_state_t *pRExC_state \ - |NN regnode **scanp|NN SSize_t *minlenp \ - |NN SSize_t *deltap|NN regnode *last \ - |NULLOK struct scan_data_t *data \ - |I32 stopparen|U32 recursed_depth \ - |NULLOK regnode_ssc *and_withp \ - |U32 flags|U32 depth|bool was_mutate_ok -ES |void |rck_elide_nothing|NN regnode *node -ESR |SV * |get_ANYOFM_contents|NN const regnode * n -ESR |SV * |get_ANYOFHbbm_contents|NN const regnode * n -ES |void |populate_bitmap_from_invlist \ +#endif +#if defined(PERL_IN_REGCOMP_INVLIST_C) || defined(PERL_IN_REGCOMP_C) +Ep |void |populate_bitmap_from_invlist \ |NN SV * invlist \ |const UV offset \ |NN const U8 * bitmap \ |const Size_t len -ES |void |populate_invlist_from_bitmap \ +Ep |void |populate_invlist_from_bitmap \ |NN const U8 * bitmap \ |const Size_t bitmap_len \ |NN SV ** invlist \ |const UV offset -ESRT |U32 |add_data |NN RExC_state_t* const pRExC_state \ +#endif +#if defined(PERL_IN_REGCOMP_ANY) +EpRT |U32 |reg_add_data |NN RExC_state_t* const pRExC_state \ |NN const char* const s|const U32 n -frS |void |re_croak |bool utf8|NN const char* pat|... -ES |int |handle_possible_posix \ - |NN RExC_state_t *pRExC_state \ - |NN const char* const s \ - |NULLOK char ** updated_parse_ptr \ - |NULLOK AV** posix_warnings \ - |const bool check_only -ES |I32 |make_trie |NN RExC_state_t *pRExC_state \ - |NN regnode *startbranch|NN regnode *first \ - |NN regnode *last|NN regnode *tail \ - |U32 word_count|U32 flags|U32 depth -ES |regnode *|construct_ahocorasick_from_trie|NN RExC_state_t *pRExC_state \ - |NN regnode *source|U32 depth -ETSR |int |edit_distance |NN const UV *src \ - |NN const UV *tgt \ - |const STRLEN x \ - |const STRLEN y \ - |const SSize_t maxDistance -# ifdef DEBUGGING -EFp |int |re_indentf |NN const char *fmt|U32 depth|... -ES |void |regdump_intflags|NULLOK const char *lead| const U32 flags -ES |void |regdump_extflags|NULLOK const char *lead| const U32 flags -ES |const regnode*|dumpuntil|NN const regexp *r|NN const regnode *start \ - |NN const regnode *node \ - |NULLOK const regnode *last \ - |NULLOK const regnode *plast \ - |NN SV* sv|I32 indent|U32 depth -ES |void |put_code_point |NN SV* sv|UV c -ES |U8 |put_charclass_bitmap_innards|NN SV* sv \ - |NULLOK char* bitmap \ - |NULLOK SV* nonbitmap_invlist \ - |NULLOK SV* only_utf8_locale_invlist\ - |NULLOK const regnode * const node \ - |const U8 flags \ - |const bool force_as_is_display -ES |SV* |put_charclass_bitmap_innards_common \ - |NN SV* invlist \ - |NULLOK SV* posixes \ - |NULLOK SV* only_utf8 \ - |NULLOK SV* not_utf8 \ - |NULLOK SV* only_utf8_locale \ - |const bool invert -ES |void |put_charclass_bitmap_innards_invlist \ - |NN SV *sv \ - |NN SV* invlist -ES |void |put_range |NN SV* sv|UV start|const UV end \ - |const bool allow_literals -ES |void |dump_trie |NN const struct _reg_trie_data *trie\ - |NULLOK HV* widecharmap|NN AV *revcharmap\ - |U32 depth -ES |void |dump_trie_interim_list|NN const struct _reg_trie_data *trie\ - |NULLOK HV* widecharmap|NN AV *revcharmap\ - |U32 next_alloc|U32 depth -ES |void |dump_trie_interim_table|NN const struct _reg_trie_data *trie\ - |NULLOK HV* widecharmap|NN AV *revcharmap\ - |U32 next_alloc|U32 depth -ESR |bool |regtail_study |NN RExC_state_t *pRExC_state \ - |NN regnode_offset p|NN const regnode_offset val|U32 depth -# endif -# ifndef PERL_EXT_RE_BUILD -EiRT |UV* |_invlist_array_init |NN SV* const invlist|const bool will_have_0 -EiRT |UV |invlist_max |NN const SV* const invlist -EiRT |IV* |get_invlist_previous_index_addr|NN SV* invlist -EiT |void |invlist_set_previous_index|NN SV* const invlist|const IV index -EiRT |IV |invlist_previous_index|NN SV* const invlist -EiT |void |invlist_trim |NN SV* invlist -Ei |void |invlist_clear |NN SV* invlist -ES |void |_append_range_to_invlist |NN SV* const invlist|const UV start|const UV end -ES |void |invlist_replace_list_destroys_src|NN SV *dest|NN SV *src -S |void |initialize_invlist_guts|NN SV* invlist|const Size_t initial_size -# endif #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_OP_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_OP_C) EiR |SV* |add_cp_to_invlist |NULLOK SV* invlist|const UV cp Ei |void |invlist_extend |NN SV* const invlist|const UV len Ei |void |invlist_set_len|NN SV* const invlist|const UV len|const bool offset EiRT |UV |invlist_highest|NN SV* const invlist #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_UTF8_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_UTF8_C) m |void |_invlist_intersection |NN SV* const a|NN SV* const b|NN SV** i EXp |void |_invlist_intersection_maybe_complement_2nd \ |NULLOK SV* const a|NN SV* const b \ @@ -2394,25 +2366,25 @@ EpRX |const char *|form_cp_too_large_msg|const U8 which \ |const Size_t len \ |const UV cp #endif -#if defined(PERL_IN_REGCOMP_C) || defined (PERL_IN_DUMP_C) || defined(PERL_IN_OP_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined (PERL_IN_DUMP_C) || defined(PERL_IN_OP_C) EXp |void |_invlist_dump |NN PerlIO *file|I32 level \ |NN const char* const indent \ |NN SV* const invlist #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_OP_C) EiRT |STRLEN*|get_invlist_iter_addr |NN SV* invlist EiT |void |invlist_iterinit|NN SV* invlist EiRT |bool |invlist_iternext|NN SV* invlist|NN UV* start|NN UV* end EiT |void |invlist_iterfinish|NN SV* invlist #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_PERL_C) || defined(PERL_IN_UTF8_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_PERL_C) || defined(PERL_IN_UTF8_C) EXpR |SV* |_new_invlist_C_array|NN const UV* const list EXp |bool |_invlistEQ |NN SV* const a|NN SV* const b|const bool complement_b #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_PP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C) EiT |const char *|get_regex_charset_name|const U32 flags|NN STRLEN* const lenp #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) \ +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_REGEXEC_C) \ || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) \ || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) \ || defined(PERL_IN_DOOP_C) @@ -2423,7 +2395,7 @@ EiRT |UV |_invlist_len |NN SV* const invlist EiRT |bool |_invlist_contains_cp|NN SV* const invlist|const UV cp EXpRT |SSize_t|_invlist_search |NN SV* const invlist|const UV cp #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +#if defined(PERL_IN_REGEX_ENGINE) # ifndef PERL_EXT_RE_BUILD Ep |SV* |get_regclass_aux_data \ |NULLOK const regexp *prog \ @@ -2440,10 +2412,69 @@ Ep |SV* |get_re_gclass_aux_data \ |NULLOK SV **listsvp \ |NULLOK SV **lonly_utf8_locale \ |NULLOK SV **output_invlist +# endif #endif -Ep |void |regprop |NULLOK const regexp *prog|NN SV* sv|NN const regnode* o|NULLOK const regmatch_info *reginfo \ - |NULLOK const RExC_state_t *pRExC_state +#if defined(PERL_IN_REGCOMP_ANY) +Ep |I32 |make_trie |NN RExC_state_t *pRExC_state \ + |NN regnode *startbranch|NN regnode *first \ + |NN regnode *last|NN regnode *tail \ + |U32 word_count|U32 flags|U32 depth +Ep |regnode *|construct_ahocorasick_from_trie|NN RExC_state_t *pRExC_state \ + |NN regnode *source|U32 depth +# if defined(PERL_IN_REGCOMP_TRIE_C) && defined(DEBUGGING) +ES |void |dump_trie |NN const struct _reg_trie_data *trie\ + |NULLOK HV* widecharmap|NN AV *revcharmap\ + |U32 depth +ES |void |dump_trie_interim_list|NN const struct _reg_trie_data *trie\ + |NULLOK HV* widecharmap|NN AV *revcharmap\ + |U32 next_alloc|U32 depth +ES |void |dump_trie_interim_table|NN const struct _reg_trie_data *trie\ + |NULLOK HV* widecharmap|NN AV *revcharmap\ + |U32 next_alloc|U32 depth +# endif +#endif +#if defined(PERL_IN_REGEX_ENGINE) && defined(DEBUGGING) +EFp |int |re_indentf |NN const char *fmt|U32 depth|... Efp |int |re_printf |NN const char *fmt|... +Ep |void |debug_show_study_flags|U32 flags|NN const char *open_str \ + |NN const char *close_str +Ep |void |regprop |NULLOK const regexp *prog|NN SV* sv|NN const regnode* o \ + |NULLOK const regmatch_info *reginfo \ + |NULLOK const RExC_state_t *pRExC_state +Ep |void |debug_studydata|NN const char *where|NULLOK scan_data_t *data \ + |U32 depth|int is_inf|SSize_t min \ + |SSize_t stopmin|SSize_t delta +Ep |void |debug_peep |NN const char *str|NN const RExC_state_t *pRExC_state \ + |NULLOK regnode *scan|U32 depth|U32 flags +Ep |const regnode*|dumpuntil|NN const regexp *r|NN const regnode *start \ + |NN const regnode *node \ + |NULLOK const regnode *last \ + |NULLOK const regnode *plast \ + |NN SV* sv|I32 indent|U32 depth +#endif +#if defined(PERL_IN_REGCOMP_DEBUG_C) && defined(DEBUGGING) +ES |void |regdump_intflags|NULLOK const char *lead| const U32 flags +ES |void |regdump_extflags|NULLOK const char *lead| const U32 flags +ES |void |put_code_point |NN SV* sv|UV c +ES |U8 |put_charclass_bitmap_innards|NN SV* sv \ + |NULLOK char* bitmap \ + |NULLOK SV* nonbitmap_invlist \ + |NULLOK SV* only_utf8_locale_invlist\ + |NULLOK const regnode * const node \ + |const U8 flags \ + |const bool force_as_is_display +ES |SV* |put_charclass_bitmap_innards_common \ + |NN SV* invlist \ + |NULLOK SV* posixes \ + |NULLOK SV* only_utf8 \ + |NULLOK SV* not_utf8 \ + |NULLOK SV* only_utf8_locale \ + |const bool invert +ES |void |put_charclass_bitmap_innards_invlist \ + |NN SV *sv \ + |NN SV* invlist +ES |void |put_range |NN SV* sv|UV start|const UV end \ + |const bool allow_literals #endif #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) ERp |bool |is_grapheme |NN const U8 * strbeg|NN const U8 * s|NN const U8 *strend|const UV cp @@ -2451,7 +2482,7 @@ ERp |bool |is_grapheme |NN const U8 * strbeg|NN const U8 * s|NN const U8 *strend #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_UTF8_C) EXTp |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const unsigned int flags #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_SV_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_SV_C) EpX |SV* |invlist_clone |NN SV* const invlist|NULLOK SV* newlist #endif #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) @@ -838,7 +838,7 @@ #define dump_mstats(a) Perl_dump_mstats(aTHX_ a) #define get_mstats(a,b,c) Perl_get_mstats(aTHX_ a,b,c) #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +#if defined(PERL_IN_REGEX_ENGINE) #define check_regnode_after(a,b) Perl_check_regnode_after(aTHX_ a,b) #define regnext(a) Perl_regnext(aTHX_ a) #define regnode_after(a,b) Perl_regnode_after(aTHX_ a,b) @@ -964,12 +964,12 @@ #define my_memrchr S_my_memrchr # endif # if !(!defined(PERL_EXT_RE_BUILD)) -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +# if defined(PERL_IN_REGEX_ENGINE) #define get_re_gclass_aux_data(a,b,c,d,e,f) Perl_get_re_gclass_aux_data(aTHX_ a,b,c,d,e,f) # endif # endif # if !defined(PERL_EXT_RE_BUILD) -# if defined(PERL_IN_REGCOMP_C) +# if defined(PERL_IN_REGCOMP_INVLIST_C) #define _append_range_to_invlist(a,b,c) S__append_range_to_invlist(aTHX_ a,b,c) #define _invlist_array_init S__invlist_array_init #define get_invlist_previous_index_addr S_get_invlist_previous_index_addr @@ -980,26 +980,12 @@ #define invlist_set_previous_index S_invlist_set_previous_index #define invlist_trim S_invlist_trim # endif -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +# if defined(PERL_IN_REGEX_ENGINE) #define get_regclass_aux_data(a,b,c,d,e,f) Perl_get_regclass_aux_data(aTHX_ a,b,c,d,e,f) # endif # endif # if defined(DEBUGGING) # if defined(PERL_IN_REGCOMP_C) -#define dump_trie(a,b,c,d) S_dump_trie(aTHX_ a,b,c,d) -#define dump_trie_interim_list(a,b,c,d,e) S_dump_trie_interim_list(aTHX_ a,b,c,d,e) -#define dump_trie_interim_table(a,b,c,d,e) S_dump_trie_interim_table(aTHX_ a,b,c,d,e) -#define dumpuntil(a,b,c,d,e,f,g,h) S_dumpuntil(aTHX_ a,b,c,d,e,f,g,h) -#define put_charclass_bitmap_innards(a,b,c,d,e,f,g) S_put_charclass_bitmap_innards(aTHX_ a,b,c,d,e,f,g) -#define put_charclass_bitmap_innards_common(a,b,c,d,e,f) S_put_charclass_bitmap_innards_common(aTHX_ a,b,c,d,e,f) -#define put_charclass_bitmap_innards_invlist(a,b) S_put_charclass_bitmap_innards_invlist(aTHX_ a,b) -#define put_code_point(a,b) S_put_code_point(aTHX_ a,b) -#define put_range(a,b,c,d) S_put_range(aTHX_ a,b,c,d) -#if !defined(MULTIPLICITY) || defined(PERL_CORE) -#define re_indentf(a,...) Perl_re_indentf(aTHX_ a,__VA_ARGS__) -#endif -#define regdump_extflags(a,b) S_regdump_extflags(aTHX_ a,b) -#define regdump_intflags(a,b) S_regdump_intflags(aTHX_ a,b) #define regnode_guts_debug(a,b,c) S_regnode_guts_debug(aTHX_ a,b,c) #define regtail_study(a,b,c,d) S_regtail_study(aTHX_ a,b,c,d) # endif @@ -1035,20 +1021,80 @@ #define mem_collxfrm_(a,b,c,d) Perl_mem_collxfrm_(aTHX_ a,b,c,d) # endif # endif +# if defined(PERL_IN_REGCOMP_ANY) +#define add_above_Latin1_folds(a,b,c) Perl_add_above_Latin1_folds(aTHX_ a,b,c) +#define construct_ahocorasick_from_trie(a,b,c) Perl_construct_ahocorasick_from_trie(aTHX_ a,b,c) +#define get_ANYOFHbbm_contents(a) Perl_get_ANYOFHbbm_contents(aTHX_ a) +#define get_ANYOFM_contents(a) Perl_get_ANYOFM_contents(aTHX_ a) +#define invlist_contents(a,b) S_invlist_contents(aTHX_ a,b) +#define invlist_highest_range_start S_invlist_highest_range_start +#define invlist_is_iterating S_invlist_is_iterating +#define invlist_lowest S_invlist_lowest +#define is_ssc_worth_it Perl_is_ssc_worth_it +#define join_exact(a,b,c,d,e,f,g) Perl_join_exact(aTHX_ a,b,c,d,e,f,g) +#define make_trie(a,b,c,d,e,f,g,h) Perl_make_trie(aTHX_ a,b,c,d,e,f,g,h) +#define populate_anyof_bitmap_from_invlist(a,b) Perl_populate_anyof_bitmap_from_invlist(aTHX_ a,b) +#define reg_add_data Perl_reg_add_data +#define scan_commit(a,b,c,d) Perl_scan_commit(aTHX_ a,b,c,d) +#define set_ANYOF_arg(a,b,c,d,e) Perl_set_ANYOF_arg(aTHX_ a,b,c,d,e) +#define ssc_finalize(a,b) Perl_ssc_finalize(aTHX_ a,b) +#define ssc_init(a,b) Perl_ssc_init(aTHX_ a,b) +#define study_chunk(a,b,c,d,e,f,g,h,i,j,k,l) Perl_study_chunk(aTHX_ a,b,c,d,e,f,g,h,i,j,k,l) +# if defined(PERL_IN_REGCOMP_TRIE_C) && defined(DEBUGGING) +#define dump_trie(a,b,c,d) S_dump_trie(aTHX_ a,b,c,d) +#define dump_trie_interim_list(a,b,c,d,e) S_dump_trie_interim_list(aTHX_ a,b,c,d,e) +#define dump_trie_interim_table(a,b,c,d,e) S_dump_trie_interim_table(aTHX_ a,b,c,d,e) +# endif +# endif +# if defined(PERL_IN_REGCOMP_ANY) || defined (PERL_IN_DUMP_C) || defined(PERL_IN_OP_C) +#define _invlist_dump(a,b,c,d) Perl__invlist_dump(aTHX_ a,b,c,d) +# endif +# if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_OP_C) +#define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b) +#define invlist_extend(a,b) S_invlist_extend(aTHX_ a,b) +#define invlist_highest S_invlist_highest +#define invlist_set_len(a,b,c) S_invlist_set_len(aTHX_ a,b,c) +# endif +# if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_UTF8_C) +#define _add_range_to_invlist(a,b,c) Perl__add_range_to_invlist(aTHX_ a,b,c) +#define _invlist_intersection_maybe_complement_2nd(a,b,c,d) Perl__invlist_intersection_maybe_complement_2nd(aTHX_ a,b,c,d) +#define _invlist_invert(a) Perl__invlist_invert(aTHX_ a) +#define _invlist_union_maybe_complement_2nd(a,b,c,d) Perl__invlist_union_maybe_complement_2nd(aTHX_ a,b,c,d) +#define _new_invlist(a) Perl__new_invlist(aTHX_ a) +#define _setup_canned_invlist(a,b,c) Perl__setup_canned_invlist(aTHX_ a,b,c) +# endif +# if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_OP_C) +#define get_invlist_iter_addr S_get_invlist_iter_addr +#define invlist_iterfinish S_invlist_iterfinish +#define invlist_iterinit S_invlist_iterinit +#define invlist_iternext S_invlist_iternext +# endif +# if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_PERL_C) || defined(PERL_IN_UTF8_C) +#define _invlistEQ(a,b,c) Perl__invlistEQ(aTHX_ a,b,c) +#define _new_invlist_C_array(a) Perl__new_invlist_C_array(aTHX_ a) +# endif +# if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_PP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C) +#define get_regex_charset_name S_get_regex_charset_name +# endif +# if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_DOOP_C) +#define _invlist_contains_cp S__invlist_contains_cp +#define _invlist_len S__invlist_len +#define _invlist_search Perl__invlist_search +#define get_invlist_offset_addr S_get_invlist_offset_addr +#define invlist_array S_invlist_array +#define is_invlist S_is_invlist +# endif +# if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_SV_C) +#define invlist_clone(a,b) Perl_invlist_clone(aTHX_ a,b) +# endif # if defined(PERL_IN_REGCOMP_C) -#define add_above_Latin1_folds(a,b,c) S_add_above_Latin1_folds(aTHX_ a,b,c) -#define add_data S_add_data #define add_multi_match(a,b,c) S_add_multi_match(aTHX_ a,b,c) #define change_engine_size(a,b) S_change_engine_size(aTHX_ a,b) #define compile_wildcard(a,b,c) S_compile_wildcard(aTHX_ a,b,c) #define compute_EXACTish S_compute_EXACTish -#define construct_ahocorasick_from_trie(a,b,c) S_construct_ahocorasick_from_trie(aTHX_ a,b,c) #define edit_distance S_edit_distance #define execute_wildcard(a,b,c,d,e,f,g) S_execute_wildcard(aTHX_ a,b,c,d,e,f,g) #define find_first_differing_byte_pos S_find_first_differing_byte_pos -#define get_ANYOFHbbm_contents(a) S_get_ANYOFHbbm_contents(aTHX_ a) -#define get_ANYOFM_contents(a) S_get_ANYOFM_contents(aTHX_ a) -#define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b) #define get_quantifier_value(a,b,c) S_get_quantifier_value(aTHX_ a,b,c) #define grok_bslash_N(a,b,c,d,e,f,g) S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g) #define handle_named_backref(a,b,c,d) S_handle_named_backref(aTHX_ a,b,c,d) @@ -1056,23 +1102,11 @@ #define handle_possible_posix(a,b,c,d,e) S_handle_possible_posix(aTHX_ a,b,c,d,e) #define handle_regex_sets(a,b,c,d) S_handle_regex_sets(aTHX_ a,b,c,d) #define handle_user_defined_property(a,b,c,d,e,f,g,h,i,j) S_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i,j) -#define invlist_contents(a,b) S_invlist_contents(aTHX_ a,b) -#define invlist_highest_range_start S_invlist_highest_range_start -#define invlist_is_iterating S_invlist_is_iterating -#define invlist_lowest S_invlist_lowest -#define is_ssc_worth_it S_is_ssc_worth_it -#define join_exact(a,b,c,d,e,f,g) S_join_exact(aTHX_ a,b,c,d,e,f,g) -#define make_exactf_invlist(a,b) S_make_exactf_invlist(aTHX_ a,b) -#define make_trie(a,b,c,d,e,f,g,h) S_make_trie(aTHX_ a,b,c,d,e,f,g,h) #define nextchar(a) S_nextchar(aTHX_ a) #define optimize_regclass(a,b,c,d,e,f,g,h,i,j) S_optimize_regclass(aTHX_ a,b,c,d,e,f,g,h,i,j) #define output_posix_warnings(a,b) S_output_posix_warnings(aTHX_ a,b) #define parse_lparen_question_flags(a) S_parse_lparen_question_flags(aTHX_ a) #define parse_uniprop_string(a,b,c,d,e,f,g,h,i,j) S_parse_uniprop_string(aTHX_ a,b,c,d,e,f,g,h,i,j) -#define populate_anyof_bitmap_from_invlist(a,b) S_populate_anyof_bitmap_from_invlist(aTHX_ a,b) -#define populate_bitmap_from_invlist(a,b,c,d) S_populate_bitmap_from_invlist(aTHX_ a,b,c,d) -#define populate_invlist_from_bitmap(a,b,c,d) S_populate_invlist_from_bitmap(aTHX_ a,b,c,d) -#define rck_elide_nothing(a) S_rck_elide_nothing(aTHX_ a) #define reg(a,b,c,d) S_reg(aTHX_ a,b,c,d) #define reg2Lanode(a,b,c,d) S_reg2Lanode(aTHX_ a,b,c,d) #define reg_la_NOTHING(a,b,c) S_reg_la_NOTHING(aTHX_ a,b,c) @@ -1090,40 +1124,8 @@ #define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c) #define regpnode(a,b,c) S_regpnode(aTHX_ a,b,c) #define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d) -#define scan_commit(a,b,c,d) S_scan_commit(aTHX_ a,b,c,d) -#define set_ANYOF_arg(a,b,c,d,e) S_set_ANYOF_arg(aTHX_ a,b,c,d,e) #define set_regex_pv(a,b) S_set_regex_pv(aTHX_ a,b) #define skip_to_be_ignored_text(a,b,c) S_skip_to_be_ignored_text(aTHX_ a,b,c) -#define ssc_add_range(a,b,c) S_ssc_add_range(aTHX_ a,b,c) -#define ssc_and(a,b,c) S_ssc_and(aTHX_ a,b,c) -#define ssc_anything(a) S_ssc_anything(aTHX_ a) -#define ssc_clear_locale S_ssc_clear_locale -#define ssc_cp_and(a,b) S_ssc_cp_and(aTHX_ a,b) -#define ssc_finalize(a,b) S_ssc_finalize(aTHX_ a,b) -#define ssc_init(a,b) S_ssc_init(aTHX_ a,b) -#define ssc_intersection(a,b,c) S_ssc_intersection(aTHX_ a,b,c) -#define ssc_is_anything S_ssc_is_anything -#define ssc_is_cp_posixl_init S_ssc_is_cp_posixl_init -#define ssc_or(a,b,c) S_ssc_or(aTHX_ a,b,c) -#define ssc_union(a,b,c) S_ssc_union(aTHX_ a,b,c) -#define study_chunk(a,b,c,d,e,f,g,h,i,j,k,l) S_study_chunk(aTHX_ a,b,c,d,e,f,g,h,i,j,k,l) -# endif -# if defined(PERL_IN_REGCOMP_C) || defined (PERL_IN_DUMP_C) || defined(PERL_IN_OP_C) -#define _invlist_dump(a,b,c,d) Perl__invlist_dump(aTHX_ a,b,c,d) -# endif -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_OP_C) -#define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b) -#define invlist_extend(a,b) S_invlist_extend(aTHX_ a,b) -#define invlist_highest S_invlist_highest -#define invlist_set_len(a,b,c) S_invlist_set_len(aTHX_ a,b,c) -# endif -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_UTF8_C) -#define _add_range_to_invlist(a,b,c) Perl__add_range_to_invlist(aTHX_ a,b,c) -#define _invlist_intersection_maybe_complement_2nd(a,b,c,d) Perl__invlist_intersection_maybe_complement_2nd(aTHX_ a,b,c,d) -#define _invlist_invert(a) Perl__invlist_invert(aTHX_ a) -#define _invlist_union_maybe_complement_2nd(a,b,c,d) Perl__invlist_union_maybe_complement_2nd(aTHX_ a,b,c,d) -#define _new_invlist(a) Perl__new_invlist(aTHX_ a) -#define _setup_canned_invlist(a,b,c) Perl__setup_canned_invlist(aTHX_ a,b,c) # endif # if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_DQUOTE_C) || defined(PERL_IN_TOKE_C) #define form_alien_digit_msg(a,b,c,d,e,f) Perl_form_alien_digit_msg(aTHX_ a,b,c,d,e,f) @@ -1134,45 +1136,44 @@ # if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_DQUOTE_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) #define form_cp_too_large_msg(a,b,c,d) Perl_form_cp_too_large_msg(aTHX_ a,b,c,d) # endif -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) -#define get_invlist_iter_addr S_get_invlist_iter_addr -#define invlist_iterfinish S_invlist_iterfinish -#define invlist_iterinit S_invlist_iterinit -#define invlist_iternext S_invlist_iternext -# endif -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_PERL_C) || defined(PERL_IN_UTF8_C) -#define _invlistEQ(a,b,c) Perl__invlistEQ(aTHX_ a,b,c) -#define _new_invlist_C_array(a) Perl__new_invlist_C_array(aTHX_ a) -# endif -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C) -#define get_regex_charset_name S_get_regex_charset_name -# endif -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) -#if !defined(MULTIPLICITY) || defined(PERL_CORE) -#define re_printf(...) Perl_re_printf(aTHX_ __VA_ARGS__) -#endif -#define regprop(a,b,c,d,e) Perl_regprop(aTHX_ a,b,c,d,e) -# endif -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_DOOP_C) -#define _invlist_contains_cp S__invlist_contains_cp -#define _invlist_len S__invlist_len -#define _invlist_search Perl__invlist_search -#define get_invlist_offset_addr S_get_invlist_offset_addr -#define invlist_array S_invlist_array -#define is_invlist S_is_invlist -# endif # if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) #define is_grapheme(a,b,c,d) Perl_is_grapheme(aTHX_ a,b,c,d) # endif # if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_UTF8_C) #define _to_fold_latin1 Perl__to_fold_latin1 # endif -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_SV_C) -#define invlist_clone(a,b) Perl_invlist_clone(aTHX_ a,b) -# endif # if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) #define regcurly Perl_regcurly # endif +# if defined(PERL_IN_REGCOMP_DEBUG_C) && defined(DEBUGGING) +#define put_charclass_bitmap_innards(a,b,c,d,e,f,g) S_put_charclass_bitmap_innards(aTHX_ a,b,c,d,e,f,g) +#define put_charclass_bitmap_innards_common(a,b,c,d,e,f) S_put_charclass_bitmap_innards_common(aTHX_ a,b,c,d,e,f) +#define put_charclass_bitmap_innards_invlist(a,b) S_put_charclass_bitmap_innards_invlist(aTHX_ a,b) +#define put_code_point(a,b) S_put_code_point(aTHX_ a,b) +#define put_range(a,b,c,d) S_put_range(aTHX_ a,b,c,d) +#define regdump_extflags(a,b) S_regdump_extflags(aTHX_ a,b) +#define regdump_intflags(a,b) S_regdump_intflags(aTHX_ a,b) +# endif +# if defined(PERL_IN_REGCOMP_INVLIST_C) || defined(PERL_IN_REGCOMP_C) +#define populate_bitmap_from_invlist(a,b,c,d) Perl_populate_bitmap_from_invlist(aTHX_ a,b,c,d) +#define populate_invlist_from_bitmap(a,b,c,d) Perl_populate_invlist_from_bitmap(aTHX_ a,b,c,d) +# endif +# if defined(PERL_IN_REGCOMP_STUDY_C) +#define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b) +#define make_exactf_invlist(a,b) S_make_exactf_invlist(aTHX_ a,b) +#define rck_elide_nothing(a) S_rck_elide_nothing(aTHX_ a) +#define ssc_add_range(a,b,c) S_ssc_add_range(aTHX_ a,b,c) +#define ssc_and(a,b,c) S_ssc_and(aTHX_ a,b,c) +#define ssc_anything(a) S_ssc_anything(aTHX_ a) +#define ssc_clear_locale S_ssc_clear_locale +#define ssc_cp_and(a,b) S_ssc_cp_and(aTHX_ a,b) +#define ssc_intersection(a,b,c) S_ssc_intersection(aTHX_ a,b,c) +#define ssc_is_anything S_ssc_is_anything +#define ssc_is_cp_posixl_init S_ssc_is_cp_posixl_init +#define ssc_or(a,b,c) S_ssc_or(aTHX_ a,b,c) +#define ssc_union(a,b,c) S_ssc_union(aTHX_ a,b,c) +#define unwind_scan_frames(a) S_unwind_scan_frames(aTHX_ a) +# endif # if defined(PERL_IN_REGEXEC_C) #define advance_one_LB(a,b,c) S_advance_one_LB(aTHX_ a,b,c) #define advance_one_SB(a,b,c) S_advance_one_SB(aTHX_ a,b,c) @@ -1206,6 +1207,19 @@ #define to_byte_substr(a) S_to_byte_substr(aTHX_ a) #define to_utf8_substr(a) S_to_utf8_substr(aTHX_ a) # endif +# if defined(PERL_IN_REGEX_ENGINE) && defined(DEBUGGING) +#define debug_peep(a,b,c,d,e) Perl_debug_peep(aTHX_ a,b,c,d,e) +#define debug_show_study_flags(a,b,c) Perl_debug_show_study_flags(aTHX_ a,b,c) +#define debug_studydata(a,b,c,d,e,f,g) Perl_debug_studydata(aTHX_ a,b,c,d,e,f,g) +#define dumpuntil(a,b,c,d,e,f,g,h) Perl_dumpuntil(aTHX_ a,b,c,d,e,f,g,h) +#if !defined(MULTIPLICITY) || defined(PERL_CORE) +#define re_indentf(a,...) Perl_re_indentf(aTHX_ a,__VA_ARGS__) +#endif +#if !defined(MULTIPLICITY) || defined(PERL_CORE) +#define re_printf(...) Perl_re_printf(aTHX_ __VA_ARGS__) +#endif +#define regprop(a,b,c,d,e) Perl_regprop(aTHX_ a,b,c,d,e) +# endif #endif #ifdef PERL_CORE #define PerlLIO_dup2_cloexec(a,b) Perl_PerlLIO_dup2_cloexec(aTHX_ a,b) @@ -1570,7 +1584,7 @@ # endif # endif # if !defined(PERL_EXT_RE_BUILD) -# if defined(PERL_IN_REGCOMP_C) +# if defined(PERL_IN_REGCOMP_INVLIST_C) #define initialize_invlist_guts(a,b) S_initialize_invlist_guts(aTHX_ a,b) # endif # endif diff --git a/ext/re/Makefile.PL b/ext/re/Makefile.PL index f3bdcfdc69..8d4a576e4c 100644 --- a/ext/re/Makefile.PL +++ b/ext/re/Makefile.PL @@ -1,8 +1,31 @@ +use strict; +use warnings; use ExtUtils::MakeMaker; use File::Spec; use Config; +# [ src => @deps ] +our @files = ( + # compiler files ######################################## + ['regcomp.c' => 'dquote.c', 'invlist_inline.h' ], + ['regcomp_invlist.c' => 'invlist_inline.h' ], + ['regcomp_study.c' ], + ['regcomp_trie.c' ], + ['regcomp_debug.c' ], + # execution engine files ################################ + ['regexec.c' => 'invlist_inline.h' ], + # misc files ############################################ + ['dquote.c' ], + ['invlist_inline.h' ], + ######################################################### +); -my $object = 're_exec$(OBJ_EXT) re_comp$(OBJ_EXT) re$(OBJ_EXT)'; +my @objects = 're$(OBJ_EXT)'; +foreach my $tuple (@files) { + my $src_file = $tuple->[0]; + if ($src_file=~s/reg/re_/ and $src_file=~s/\.c/\$(OBJ_EXT)/) { + push @objects, $src_file; + } +} my $defines = '-DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT'; @@ -15,45 +38,41 @@ WriteMakefile( @libs ? ( 'LIBS' => [ join(" ", map { "-l$_" } @libs) ] ) : (), VERSION_FROM => 're.pm', XSPROTOARG => '-noprototypes', - OBJECT => $object, + OBJECT => "@objects", DEFINE => $defines, clean => { FILES => '*$(OBJ_EXT) invlist_inline.h *.c ../../lib/re.pm' }, ); package MY; - sub upupfile { File::Spec->catfile(File::Spec->updir, File::Spec->updir, $_[0]); } sub postamble { - my $regcomp_c = upupfile('regcomp.c'); - my $regexec_c = upupfile('regexec.c'); - my $dquote_c = upupfile('dquote.c'); - my $invlist_inline_h = upupfile('invlist_inline.h'); + my $postamble = ""; + foreach my $tuple (@::files) { + my ($file, @deps) = @$tuple; + my $src_file = upupfile($file); + my $target = $file; + $target =~ s/^reg/re_/; + $postamble .= <<EOF; -re_comp.c : $regcomp_c - - \$(RM_F) re_comp.c - \$(CP) $regcomp_c re_comp.c - -re_comp\$(OBJ_EXT) : re_comp.c dquote.c invlist_inline.h +$target : $src_file + - \$(RM_F) $target + \$(CP) $src_file $target -re_exec.c : $regexec_c - - \$(RM_F) re_exec.c - \$(CP) $regexec_c re_exec.c - -re_exec\$(OBJ_EXT) : re_exec.c invlist_inline.h - -dquote.c : $dquote_c - - \$(RM_F) dquote.c - \$(CP) $dquote_c dquote.c - -invlist_inline.h : $invlist_inline_h - - \$(RM_F) invlist_inline.h - \$(CP) $invlist_inline_h invlist_inline.h +EOF + next if $target eq $file; + my $base_name = $target; + if ($base_name=~s/\.c\z//) { + $postamble .= <<EOF +$base_name\$(OBJ_EXT) : $target @deps EOF + } + } + return $postamble } sub MY::c_o { @@ -1685,7 +1685,7 @@ END_EXTERN_C # endif /* Participates in a single-character fold with a character above 255 */ -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +# if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_REGEXEC_C) # define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(c) \ (( ! cBOOL(FITS_IN_8_BITS(c))) \ || (PL_charclass[(U8) (c)] & CC_mask_(CC_NONLATIN1_SIMPLE_FOLD_))) @@ -2639,7 +2639,7 @@ S_lossless_NV_to_IV(const NV nv, IV *ivp) /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */ -#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C) +#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C) #define MAX_CHARSET_NAME_LENGTH 2 diff --git a/invlist_inline.h b/invlist_inline.h index ebb37b6543..8b28c21885 100644 --- a/invlist_inline.h +++ b/invlist_inline.h @@ -10,7 +10,7 @@ #define PERL_INVLIST_INLINE_H_ #if defined(PERL_IN_UTF8_C) \ - || defined(PERL_IN_REGCOMP_C) \ + || defined(PERL_IN_REGCOMP_ANY) \ || defined(PERL_IN_REGEXEC_C) \ || defined(PERL_IN_TOKE_C) \ || defined(PERL_IN_PP_C) \ @@ -93,7 +93,7 @@ S_invlist_array(SV* const invlist) } #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) PERL_STATIC_INLINE void S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) @@ -161,7 +161,7 @@ S_invlist_highest(SV* const invlist) : array[len - 1] - 1; } -# if defined(PERL_IN_REGCOMP_C) +# if defined(PERL_IN_REGCOMP_ANY) PERL_STATIC_INLINE UV S_invlist_highest_range_start(SV* const invlist) @@ -199,7 +199,7 @@ S_invlist_highest_range_start(SV* const invlist) # endif #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_OP_C) PERL_STATIC_INLINE STRLEN* S_get_invlist_iter_addr(SV* invlist) @@ -275,11 +275,101 @@ S_invlist_iternext(SV* invlist, UV* start, UV* end) #endif -#ifndef PERL_IN_REGCOMP_C +#ifndef PERL_IN_REGCOMP_ANY /* These symbols are only needed later in regcomp.c */ # undef TO_INTERNAL_SIZE # undef FROM_INTERNAL_SIZE #endif +#ifdef PERL_IN_REGCOMP_ANY +PERL_STATIC_INLINE +bool +S_invlist_is_iterating(const SV* const invlist) +{ + PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; + + /* get_invlist_iter_addr()'s sv is non-const only because it returns a + * value that can be used to modify the invlist, it doesn't modify the + * invlist itself */ + return *(get_invlist_iter_addr((SV*)invlist)) < (STRLEN) UV_MAX; +} + +PERL_STATIC_INLINE +SV * +S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style) +{ + /* Get the contents of an inversion list into a string SV so that they can + * be printed out. If 'traditional_style' is TRUE, it uses the format + * traditionally done for debug tracing; otherwise it uses a format + * suitable for just copying to the output, with blanks between ranges and + * a dash between range components */ + + UV start, end; + SV* output; + const char intra_range_delimiter = (traditional_style ? '\t' : '-'); + const char inter_range_delimiter = (traditional_style ? '\n' : ' '); + + if (traditional_style) { + output = newSVpvs("\n"); + } + else { + output = newSVpvs(""); + } + + PERL_ARGS_ASSERT_INVLIST_CONTENTS; + + assert(! invlist_is_iterating(invlist)); + + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (end == UV_MAX) { + Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c", + start, intra_range_delimiter, + inter_range_delimiter); + } + else if (end != start) { + Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c", + start, + intra_range_delimiter, + end, inter_range_delimiter); + } + else { + Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c", + start, inter_range_delimiter); + } + } + + if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */ + SvCUR_set(output, SvCUR(output) - 1); + } + + return output; +} + +PERL_STATIC_INLINE +UV +S_invlist_lowest(SV* const invlist) +{ + /* Returns the lowest code point that matches an inversion list. This API + * has an ambiguity, as it returns 0 under either the lowest is actually + * 0, or if the list is empty. If this distinction matters to you, check + * for emptiness before calling this function */ + + UV len = _invlist_len(invlist); + UV *array; + + PERL_ARGS_ASSERT_INVLIST_LOWEST; + + if (len == 0) { + return 0; + } + + array = invlist_array(invlist); + + return array[0]; +} + +#endif + #endif /* PERL_INVLIST_INLINE_H_ */ @@ -4506,8 +4506,7 @@ typedef struct crypt_data { /* straight from /usr/include/crypt.h */ typedef struct magic_state MGS; /* struct magic_state defined in mg.c */ -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) \ - || defined(PERL_EXT_RE_BUILD) +#if defined(PERL_IN_REGEX_ENGINE) || defined(PERL_EXT_RE_BUILD) /* These have to be predeclared, as they are used in proto.h which is #included * before their definitions in regcomp.h. */ @@ -4523,6 +4522,7 @@ typedef struct regnode_charclass_posixl regnode_charclass_posixl; typedef struct regnode_ssc regnode_ssc; typedef struct RExC_state_t RExC_state_t; struct _reg_trie_data; +typedef struct scan_data_t scan_data_t; #endif @@ -4705,7 +4705,7 @@ PERL_STATIC_INLINE void * S_my_memrchr(const char * s, const char c, const STRLE #endif #endif #if !(!defined(PERL_EXT_RE_BUILD)) -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +# if defined(PERL_IN_REGEX_ENGINE) PERL_CALLCONV SV* Perl_get_re_gclass_aux_data(pTHX_ const regexp *prog, const struct regnode *node, bool doinit, SV **listsvp, SV **lonly_utf8_locale, SV **output_invlist) __attribute__visibility__("hidden"); #define PERL_ARGS_ASSERT_GET_RE_GCLASS_AUX_DATA \ @@ -4887,7 +4887,7 @@ STATIC PerlIO * S_doopen_pm(pTHX_ SV *name) # endif #endif #if !defined(PERL_EXT_RE_BUILD) -# if defined(PERL_IN_REGCOMP_C) +# if defined(PERL_IN_REGCOMP_INVLIST_C) STATIC void S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end); #define PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST \ assert(invlist) @@ -4941,7 +4941,7 @@ PERL_STATIC_INLINE void S_invlist_trim(SV* invlist); assert(invlist) #endif # endif -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +# if defined(PERL_IN_REGEX_ENGINE) PERL_CALLCONV SV* Perl_get_regclass_aux_data(pTHX_ const regexp *prog, const struct regnode *node, bool doinit, SV **listsvp, SV **lonly_utf8_locale, SV **output_invlist) __attribute__visibility__("hidden"); #define PERL_ARGS_ASSERT_GET_REGCLASS_AUX_DATA \ @@ -5163,42 +5163,6 @@ STATIC void S_cv_dump(pTHX_ const CV *cv, const char *title); assert(cv); assert(title) # endif # if defined(PERL_IN_REGCOMP_C) -STATIC void S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV* widecharmap, AV *revcharmap, U32 depth); -#define PERL_ARGS_ASSERT_DUMP_TRIE \ - assert(trie); assert(revcharmap) -STATIC void S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, HV* widecharmap, AV *revcharmap, U32 next_alloc, U32 depth); -#define PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST \ - assert(trie); assert(revcharmap) -STATIC void S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, HV* widecharmap, AV *revcharmap, U32 next_alloc, U32 depth); -#define PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE \ - assert(trie); assert(revcharmap) -STATIC const regnode* S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, const regnode *last, const regnode *plast, SV* sv, I32 indent, U32 depth); -#define PERL_ARGS_ASSERT_DUMPUNTIL \ - assert(r); assert(start); assert(node); assert(sv) -STATIC U8 S_put_charclass_bitmap_innards(pTHX_ SV* sv, char* bitmap, SV* nonbitmap_invlist, SV* only_utf8_locale_invlist, const regnode * const node, const U8 flags, const bool force_as_is_display); -#define PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS \ - assert(sv) -STATIC SV* S_put_charclass_bitmap_innards_common(pTHX_ SV* invlist, SV* posixes, SV* only_utf8, SV* not_utf8, SV* only_utf8_locale, const bool invert); -#define PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON \ - assert(invlist) -STATIC void S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist); -#define PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST \ - assert(sv); assert(invlist) -STATIC void S_put_code_point(pTHX_ SV* sv, UV c); -#define PERL_ARGS_ASSERT_PUT_CODE_POINT \ - assert(sv) -STATIC void S_put_range(pTHX_ SV* sv, UV start, const UV end, const bool allow_literals); -#define PERL_ARGS_ASSERT_PUT_RANGE \ - assert(sv) -PERL_CALLCONV int Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...) - __attribute__visibility__("hidden"); -#define PERL_ARGS_ASSERT_RE_INDENTF \ - assert(fmt) - -STATIC void S_regdump_extflags(pTHX_ const char *lead, const U32 flags); -#define PERL_ARGS_ASSERT_REGDUMP_EXTFLAGS -STATIC void S_regdump_intflags(pTHX_ const char *lead, const U32 flags); -#define PERL_ARGS_ASSERT_REGDUMP_INTFLAGS STATIC regnode_offset S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_len); #define PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG \ assert(pRExC_state) @@ -6411,79 +6375,29 @@ STATIC OP* S_doform(pTHX_ CV *cv, GV *gv, OP *retop); STATIC SV * S_space_join_names_mortal(pTHX_ char *const *array); #define PERL_ARGS_ASSERT_SPACE_JOIN_NAMES_MORTAL #endif -#if defined(PERL_IN_REGCOMP_C) -STATIC void S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist); +#if defined(PERL_IN_REGCOMP_ANY) +PERL_CALLCONV void Perl_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) + __attribute__visibility__("hidden"); #define PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS \ assert(pRExC_state); assert(invlist) -STATIC U32 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_ADD_DATA \ - assert(pRExC_state); assert(s) -STATIC AV* S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count); -#define PERL_ARGS_ASSERT_ADD_MULTI_MATCH \ - assert(multi_string) -STATIC void S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size); -#define PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE \ - assert(pRExC_state) -STATIC REGEXP* S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len, const bool ignore_case) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_COMPILE_WILDCARD \ - assert(subpattern) - -STATIC U8 S_compute_EXACTish(RExC_state_t *pRExC_state); -#define PERL_ARGS_ASSERT_COMPUTE_EXACTISH \ - assert(pRExC_state) -STATIC regnode * S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth); +PERL_CALLCONV regnode * Perl_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) + __attribute__visibility__("hidden"); #define PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE \ assert(pRExC_state); assert(source) -STATIC int S_edit_distance(const UV *src, const UV *tgt, const STRLEN x, const STRLEN y, const SSize_t maxDistance) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_EDIT_DISTANCE \ - assert(src); assert(tgt) -STATIC I32 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char* strend, char* strbeg, SSize_t minend, SV* screamer, U32 nosave); -#define PERL_ARGS_ASSERT_EXECUTE_WILDCARD \ - assert(prog); assert(stringarg); assert(strend); assert(strbeg); assert(screamer) -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE Size_t S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max); -#define PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS \ - assert(s1); assert(s2) -#endif -STATIC SV * S_get_ANYOFHbbm_contents(pTHX_ const regnode * n) - __attribute__warn_unused_result__; +PERL_CALLCONV SV * Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n) + __attribute__warn_unused_result__ + __attribute__visibility__("hidden"); #define PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS \ assert(n) -STATIC SV * S_get_ANYOFM_contents(pTHX_ const regnode * n) - __attribute__warn_unused_result__; +PERL_CALLCONV SV * Perl_get_ANYOFM_contents(pTHX_ const regnode * n) + __attribute__warn_unused_result__ + __attribute__visibility__("hidden"); #define PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS \ assert(n) -STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass* const node); -#define PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC \ - assert(pRExC_state); assert(node) -STATIC U32 S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state, const char * start, const char * end); -#define PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE \ - assert(pRExC_state); assert(start); assert(end) -STATIC bool S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode_offset* nodep, UV *code_point_p, int* cp_count, I32 *flagp, const bool strict, const U32 depth); -#define PERL_ARGS_ASSERT_GROK_BSLASH_N \ - assert(pRExC_state); assert(flagp) -STATIC regnode_offset S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, char * backref_parse_start, char ch); -#define PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF \ - assert(pRExC_state); assert(flagp); assert(backref_parse_start) -STATIC bool S_handle_names_wildcard(pTHX_ const char * wname, const STRLEN wname_len, SV ** prop_definition, AV ** strings); -#define PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD \ - assert(wname); assert(prop_definition); assert(strings) -STATIC int S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, const char* const s, char ** updated_parse_ptr, AV** posix_warnings, const bool check_only); -#define PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX \ - assert(pRExC_state); assert(s) -STATIC regnode_offset S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV ** return_invlist, I32 *flagp, U32 depth); -#define PERL_ARGS_ASSERT_HANDLE_REGEX_SETS \ - assert(pRExC_state); assert(flagp) -STATIC SV * S_handle_user_defined_property(pTHX_ const char * name, const STRLEN name_len, const bool is_utf8, const bool to_fold, const bool runtime, const bool deferrable, SV* contents, bool *user_defined_ptr, SV * msg, const STRLEN level); -#define PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY \ - assert(name); assert(contents); assert(user_defined_ptr); assert(msg) #ifndef PERL_NO_INLINE_FUNCTIONS PERL_STATIC_INLINE SV* S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style) __attribute__warn_unused_result__; @@ -6512,171 +6426,75 @@ PERL_STATIC_INLINE UV S_invlist_lowest(SV* const invlist) assert(invlist) #endif -STATIC bool S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc); +PERL_CALLCONV bool Perl_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) + __attribute__visibility__("hidden"); #define PERL_ARGS_ASSERT_IS_SSC_WORTH_IT \ assert(pRExC_state); assert(ssc) -STATIC U32 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *unfolded_multi_char, U32 flags, regnode *val, U32 depth); + +PERL_CALLCONV U32 Perl_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *unfolded_multi_char, U32 flags, regnode *val, U32 depth) + __attribute__visibility__("hidden"); #define PERL_ARGS_ASSERT_JOIN_EXACT \ assert(pRExC_state); assert(scan); assert(min_subtract); assert(unfolded_multi_char) -STATIC SV* S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST \ - assert(pRExC_state); assert(node) -STATIC I32 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth); +PERL_CALLCONV I32 Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) + __attribute__visibility__("hidden"); #define PERL_ARGS_ASSERT_MAKE_TRIE \ assert(pRExC_state); assert(startbranch); assert(first); assert(last); assert(tail) -STATIC void S_nextchar(pTHX_ RExC_state_t *pRExC_state); -#define PERL_ARGS_ASSERT_NEXTCHAR \ - assert(pRExC_state) -STATIC U8 S_optimize_regclass(pTHX_ RExC_state_t *pRExC_state, SV* cp_list, SV* only_utf8_locale_list, SV* upper_latin1_only_utf8_matches, const U32 has_runtime_dependency, const U32 posixl, U8 * anyof_flags, bool * invert, regnode_offset * ret, I32 *flagp); -#define PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS \ - assert(pRExC_state); assert(anyof_flags); assert(invert); assert(ret); assert(flagp) -STATIC void S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings); -#define PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS \ - assert(pRExC_state); assert(posix_warnings) -STATIC void S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state); -#define PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS \ - assert(pRExC_state) -STATIC SV * S_parse_uniprop_string(pTHX_ const char * const name, Size_t name_len, const bool is_utf8, const bool to_fold, const bool runtime, const bool deferrable, AV ** strings, bool * user_defined_ptr, SV * msg, const STRLEN level); -#define PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING \ - assert(name); assert(user_defined_ptr); assert(msg) -STATIC void S_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr); + +PERL_CALLCONV void Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) + __attribute__visibility__("hidden"); #define PERL_ARGS_ASSERT_POPULATE_ANYOF_BITMAP_FROM_INVLIST \ assert(node); assert(invlist_ptr) -STATIC void S_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len); -#define PERL_ARGS_ASSERT_POPULATE_BITMAP_FROM_INVLIST \ - assert(invlist); assert(bitmap) -STATIC void S_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_len, SV ** invlist, const UV offset); -#define PERL_ARGS_ASSERT_POPULATE_INVLIST_FROM_BITMAP \ - assert(bitmap); assert(invlist) -STATIC void S_rck_elide_nothing(pTHX_ regnode *node); -#define PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING \ - assert(node) -PERL_STATIC_NO_RET void S_re_croak(pTHX_ bool utf8, const char* pat, ...) - __attribute__noreturn__ - __attribute__format__(__printf__,pTHX_2,pTHX_3); -#define PERL_ARGS_ASSERT_RE_CROAK \ - assert(pat) - -STATIC regnode_offset S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth); -#define PERL_ARGS_ASSERT_REG \ - assert(pRExC_state); assert(flagp) -STATIC regnode_offset S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2); -#define PERL_ARGS_ASSERT_REG2LANODE \ - assert(pRExC_state) -STATIC regnode_offset S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags, const char *type); -#define PERL_ARGS_ASSERT_REG_LA_NOTHING \ - assert(pRExC_state); assert(type) -STATIC regnode_offset S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags, const char *type); -#define PERL_ARGS_ASSERT_REG_LA_OPFAIL \ - assert(pRExC_state); assert(type) -STATIC regnode_offset S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op); -#define PERL_ARGS_ASSERT_REG_NODE \ - assert(pRExC_state) -STATIC SV * S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags); -#define PERL_ARGS_ASSERT_REG_SCAN_NAME \ - assert(pRExC_state) -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE char * S_reg_skipcomment(RExC_state_t *pRExC_state, char * p); -#define PERL_ARGS_ASSERT_REG_SKIPCOMMENT \ - assert(pRExC_state); assert(p) -#endif -STATIC regnode_offset S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg); -#define PERL_ARGS_ASSERT_REGANODE \ - assert(pRExC_state) -STATIC regnode_offset S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth); -#define PERL_ARGS_ASSERT_REGATOM \ - assert(pRExC_state); assert(flagp) -STATIC regnode_offset S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth); -#define PERL_ARGS_ASSERT_REGBRANCH \ - assert(pRExC_state); assert(flagp) -STATIC regnode_offset S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool stop_at_1, bool allow_multi_fold, const bool silence_non_portable, const bool strict, bool optimizable, SV** ret_invlist); -#define PERL_ARGS_ASSERT_REGCLASS \ - assert(pRExC_state); assert(flagp) -STATIC unsigned int S_regex_set_precedence(const U8 my_operator) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_REGEX_SET_PRECEDENCE -STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, const regnode_offset operand, const U32 depth); -#define PERL_ARGS_ASSERT_REGINSERT \ - assert(pRExC_state) -STATIC regnode_offset S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_len); -#define PERL_ARGS_ASSERT_REGNODE_GUTS \ - assert(pRExC_state) -STATIC regnode_offset S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth); -#define PERL_ARGS_ASSERT_REGPIECE \ - assert(pRExC_state); assert(flagp) -STATIC regnode_offset S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg); -#define PERL_ARGS_ASSERT_REGPNODE \ - assert(pRExC_state); assert(arg) -STATIC bool S_regtail(pTHX_ RExC_state_t * pRExC_state, const regnode_offset p, const regnode_offset val, const U32 depth) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_REGTAIL \ - assert(pRExC_state); assert(p); assert(val) +PERL_CALLCONV U32 Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) + __attribute__warn_unused_result__ + __attribute__visibility__("hidden"); +#define PERL_ARGS_ASSERT_REG_ADD_DATA \ + assert(pRExC_state); assert(s) -STATIC void S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, struct scan_data_t *data, SSize_t *minlenp, int is_inf); +PERL_CALLCONV void Perl_scan_commit(pTHX_ const RExC_state_t *pRExC_state, struct scan_data_t *data, SSize_t *minlenp, int is_inf) + __attribute__visibility__("hidden"); #define PERL_ARGS_ASSERT_SCAN_COMMIT \ assert(pRExC_state); assert(data); assert(minlenp) -STATIC void S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, regnode* const node, SV* const cp_list, SV* const runtime_defns, SV* const only_utf8_locale_list); + +PERL_CALLCONV void Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, regnode* const node, SV* const cp_list, SV* const runtime_defns, SV* const only_utf8_locale_list) + __attribute__visibility__("hidden"); #define PERL_ARGS_ASSERT_SET_ANYOF_ARG \ assert(pRExC_state); assert(node) -STATIC void S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx); -#define PERL_ARGS_ASSERT_SET_REGEX_PV \ - assert(pRExC_state); assert(Rx) -STATIC void S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state, char ** p, const bool force_to_xmod); -#define PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT \ - assert(pRExC_state); assert(p) -STATIC void S_ssc_add_range(pTHX_ regnode_ssc *ssc, UV const start, UV const end); -#define PERL_ARGS_ASSERT_SSC_ADD_RANGE \ - assert(ssc) -STATIC void S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, const regnode_charclass *and_with); -#define PERL_ARGS_ASSERT_SSC_AND \ - assert(pRExC_state); assert(ssc); assert(and_with) -STATIC void S_ssc_anything(pTHX_ regnode_ssc *ssc); -#define PERL_ARGS_ASSERT_SSC_ANYTHING \ - assert(ssc) -STATIC void S_ssc_clear_locale(regnode_ssc *ssc); -#define PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE \ - assert(ssc) -STATIC void S_ssc_cp_and(pTHX_ regnode_ssc *ssc, UV const cp); -#define PERL_ARGS_ASSERT_SSC_CP_AND \ - assert(ssc) -STATIC void S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc); + +PERL_CALLCONV void Perl_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) + __attribute__visibility__("hidden"); #define PERL_ARGS_ASSERT_SSC_FINALIZE \ assert(pRExC_state); assert(ssc) -STATIC void S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc); -#define PERL_ARGS_ASSERT_SSC_INIT \ - assert(pRExC_state); assert(ssc) -STATIC void S_ssc_intersection(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert_2nd); -#define PERL_ARGS_ASSERT_SSC_INTERSECTION \ - assert(ssc); assert(invlist) -STATIC int S_ssc_is_anything(const regnode_ssc *ssc) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_SSC_IS_ANYTHING \ - assert(ssc) -STATIC int S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, const regnode_ssc *ssc) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT \ +PERL_CALLCONV void Perl_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) + __attribute__visibility__("hidden"); +#define PERL_ARGS_ASSERT_SSC_INIT \ assert(pRExC_state); assert(ssc) -STATIC void S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, const regnode_charclass *or_with); -#define PERL_ARGS_ASSERT_SSC_OR \ - assert(pRExC_state); assert(ssc); assert(or_with) -STATIC void S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert_2nd); -#define PERL_ARGS_ASSERT_SSC_UNION \ - assert(ssc); assert(invlist) -STATIC SSize_t S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SSize_t *minlenp, SSize_t *deltap, regnode *last, struct scan_data_t *data, I32 stopparen, U32 recursed_depth, regnode_ssc *and_withp, U32 flags, U32 depth, bool was_mutate_ok); +PERL_CALLCONV SSize_t Perl_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SSize_t *minlenp, SSize_t *deltap, regnode *last, struct scan_data_t *data, I32 stopparen, U32 recursed_depth, regnode_ssc *and_withp, U32 flags, U32 depth, bool was_mutate_ok) + __attribute__visibility__("hidden"); #define PERL_ARGS_ASSERT_STUDY_CHUNK \ assert(pRExC_state); assert(scanp); assert(minlenp); assert(deltap); assert(last) + +# if defined(PERL_IN_REGCOMP_TRIE_C) && defined(DEBUGGING) +STATIC void S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV* widecharmap, AV *revcharmap, U32 depth); +#define PERL_ARGS_ASSERT_DUMP_TRIE \ + assert(trie); assert(revcharmap) +STATIC void S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, HV* widecharmap, AV *revcharmap, U32 next_alloc, U32 depth); +#define PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST \ + assert(trie); assert(revcharmap) +STATIC void S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, HV* widecharmap, AV *revcharmap, U32 next_alloc, U32 depth); +#define PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE \ + assert(trie); assert(revcharmap) +# endif #endif -#if defined(PERL_IN_REGCOMP_C) || defined (PERL_IN_DUMP_C) || defined(PERL_IN_OP_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined (PERL_IN_DUMP_C) || defined(PERL_IN_OP_C) PERL_CALLCONV void Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char* const indent, SV* const invlist); #define PERL_ARGS_ASSERT__INVLIST_DUMP \ assert(file); assert(indent); assert(invlist) #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_OP_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_OP_C) #ifndef PERL_NO_INLINE_FUNCTIONS PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) __attribute__warn_unused_result__; @@ -6701,7 +6519,7 @@ PERL_STATIC_INLINE void S_invlist_set_len(pTHX_ SV* const invlist, const UV len, assert(invlist) #endif #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_UTF8_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_UTF8_C) PERL_CALLCONV SV* Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT__ADD_RANGE_TO_INVLIST @@ -6731,35 +6549,7 @@ PERL_CALLCONV SV* Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV e assert(other_elements_ptr) #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_DQUOTE_C) || defined(PERL_IN_TOKE_C) -PERL_CALLCONV const char * Perl_form_alien_digit_msg(pTHX_ const U8 which, const STRLEN valids_len, const char * const first_bad, const char * const send, const bool UTF, const bool braced) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG \ - assert(first_bad); assert(send) - -PERL_CALLCONV bool Perl_grok_bslash_c(pTHX_ const char source, U8 * result, const char** message, U32 * packed_warn) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_GROK_BSLASH_C \ - assert(result); assert(message) - -PERL_CALLCONV bool Perl_grok_bslash_o(pTHX_ char** s, const char* const send, UV* uv, const char** message, U32 * packed_warn, const bool strict, const bool allow_UV_MAX, const bool utf8) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_GROK_BSLASH_O \ - assert(s); assert(send); assert(uv); assert(message) - -PERL_CALLCONV bool Perl_grok_bslash_x(pTHX_ char** s, const char* const send, UV* uv, const char** message, U32 * packed_warn, const bool strict, const bool allow_UV_MAX, const bool utf8) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_GROK_BSLASH_X \ - assert(s); assert(send); assert(uv); assert(message) - -#endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_DQUOTE_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) -PERL_CALLCONV const char * Perl_form_cp_too_large_msg(pTHX_ const U8 which, const char * string, const Size_t len, const UV cp) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG - -#endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_OP_C) #ifndef PERL_NO_INLINE_FUNCTIONS PERL_STATIC_INLINE STRLEN* S_get_invlist_iter_addr(SV* invlist) __attribute__warn_unused_result__; @@ -6785,7 +6575,7 @@ PERL_STATIC_INLINE bool S_invlist_iternext(SV* invlist, UV* start, UV* end) #endif #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_PERL_C) || defined(PERL_IN_UTF8_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_PERL_C) || defined(PERL_IN_UTF8_C) PERL_CALLCONV bool Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b); #define PERL_ARGS_ASSERT__INVLISTEQ \ assert(a); assert(b) @@ -6795,45 +6585,14 @@ PERL_CALLCONV SV* Perl__new_invlist_C_array(pTHX_ const UV* const list) assert(list) #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_PP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C) #ifndef PERL_NO_INLINE_FUNCTIONS PERL_STATIC_INLINE const char * S_get_regex_charset_name(const U32 flags, STRLEN* const lenp); #define PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME \ assert(lenp) #endif #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE bool Perl_check_regnode_after(pTHX_ const regnode* p, const STRLEN extra) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_CHECK_REGNODE_AFTER -#endif - -PERL_CALLCONV int Perl_re_printf(pTHX_ const char *fmt, ...) - __attribute__visibility__("hidden") - __attribute__format__(__printf__,pTHX_1,pTHX_2); -#define PERL_ARGS_ASSERT_RE_PRINTF \ - assert(fmt) - -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE regnode* Perl_regnext(pTHX_ const regnode* p) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_REGNEXT -#endif - -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE regnode* Perl_regnode_after(pTHX_ const regnode* p, bool varies) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_REGNODE_AFTER -#endif - -PERL_CALLCONV void Perl_regprop(pTHX_ const regexp *prog, SV* sv, const regnode* o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state) - __attribute__visibility__("hidden"); -#define PERL_ARGS_ASSERT_REGPROP \ - assert(sv); assert(o) - -#endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_DOOP_C) +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_DOOP_C) #ifndef PERL_NO_INLINE_FUNCTIONS PERL_STATIC_INLINE bool S__invlist_contains_cp(SV* const invlist, const UV cp) __attribute__warn_unused_result__; @@ -6874,6 +6633,172 @@ PERL_STATIC_INLINE bool S_is_invlist(const SV* const invlist) #endif #endif +#if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_SV_C) +PERL_CALLCONV SV* Perl_invlist_clone(pTHX_ SV* const invlist, SV* newlist); +#define PERL_ARGS_ASSERT_INVLIST_CLONE \ + assert(invlist) +#endif +#if defined(PERL_IN_REGCOMP_C) +STATIC AV* S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count); +#define PERL_ARGS_ASSERT_ADD_MULTI_MATCH \ + assert(multi_string) +STATIC void S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size); +#define PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE \ + assert(pRExC_state) +STATIC REGEXP* S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len, const bool ignore_case) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_COMPILE_WILDCARD \ + assert(subpattern) + +STATIC U8 S_compute_EXACTish(RExC_state_t *pRExC_state); +#define PERL_ARGS_ASSERT_COMPUTE_EXACTISH \ + assert(pRExC_state) +STATIC int S_edit_distance(const UV *src, const UV *tgt, const STRLEN x, const STRLEN y, const SSize_t maxDistance) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_EDIT_DISTANCE \ + assert(src); assert(tgt) + +STATIC I32 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char* strend, char* strbeg, SSize_t minend, SV* screamer, U32 nosave); +#define PERL_ARGS_ASSERT_EXECUTE_WILDCARD \ + assert(prog); assert(stringarg); assert(strend); assert(strbeg); assert(screamer) +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE Size_t S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max); +#define PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS \ + assert(s1); assert(s2) +#endif +STATIC U32 S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state, const char * start, const char * end); +#define PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE \ + assert(pRExC_state); assert(start); assert(end) +STATIC bool S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode_offset* nodep, UV *code_point_p, int* cp_count, I32 *flagp, const bool strict, const U32 depth); +#define PERL_ARGS_ASSERT_GROK_BSLASH_N \ + assert(pRExC_state); assert(flagp) +STATIC regnode_offset S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, char * backref_parse_start, char ch); +#define PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF \ + assert(pRExC_state); assert(flagp); assert(backref_parse_start) +STATIC bool S_handle_names_wildcard(pTHX_ const char * wname, const STRLEN wname_len, SV ** prop_definition, AV ** strings); +#define PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD \ + assert(wname); assert(prop_definition); assert(strings) +STATIC int S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, const char* const s, char ** updated_parse_ptr, AV** posix_warnings, const bool check_only); +#define PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX \ + assert(pRExC_state); assert(s) +STATIC regnode_offset S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV ** return_invlist, I32 *flagp, U32 depth); +#define PERL_ARGS_ASSERT_HANDLE_REGEX_SETS \ + assert(pRExC_state); assert(flagp) +STATIC SV * S_handle_user_defined_property(pTHX_ const char * name, const STRLEN name_len, const bool is_utf8, const bool to_fold, const bool runtime, const bool deferrable, SV* contents, bool *user_defined_ptr, SV * msg, const STRLEN level); +#define PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY \ + assert(name); assert(contents); assert(user_defined_ptr); assert(msg) +STATIC void S_nextchar(pTHX_ RExC_state_t *pRExC_state); +#define PERL_ARGS_ASSERT_NEXTCHAR \ + assert(pRExC_state) +STATIC U8 S_optimize_regclass(pTHX_ RExC_state_t *pRExC_state, SV* cp_list, SV* only_utf8_locale_list, SV* upper_latin1_only_utf8_matches, const U32 has_runtime_dependency, const U32 posixl, U8 * anyof_flags, bool * invert, regnode_offset * ret, I32 *flagp); +#define PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS \ + assert(pRExC_state); assert(anyof_flags); assert(invert); assert(ret); assert(flagp) +STATIC void S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings); +#define PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS \ + assert(pRExC_state); assert(posix_warnings) +STATIC void S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state); +#define PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS \ + assert(pRExC_state) +STATIC SV * S_parse_uniprop_string(pTHX_ const char * const name, Size_t name_len, const bool is_utf8, const bool to_fold, const bool runtime, const bool deferrable, AV ** strings, bool * user_defined_ptr, SV * msg, const STRLEN level); +#define PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING \ + assert(name); assert(user_defined_ptr); assert(msg) +PERL_STATIC_NO_RET void S_re_croak(pTHX_ bool utf8, const char* pat, ...) + __attribute__noreturn__ + __attribute__format__(__printf__,pTHX_2,pTHX_3); +#define PERL_ARGS_ASSERT_RE_CROAK \ + assert(pat) + +STATIC regnode_offset S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth); +#define PERL_ARGS_ASSERT_REG \ + assert(pRExC_state); assert(flagp) +STATIC regnode_offset S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2); +#define PERL_ARGS_ASSERT_REG2LANODE \ + assert(pRExC_state) +STATIC regnode_offset S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags, const char *type); +#define PERL_ARGS_ASSERT_REG_LA_NOTHING \ + assert(pRExC_state); assert(type) +STATIC regnode_offset S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags, const char *type); +#define PERL_ARGS_ASSERT_REG_LA_OPFAIL \ + assert(pRExC_state); assert(type) +STATIC regnode_offset S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op); +#define PERL_ARGS_ASSERT_REG_NODE \ + assert(pRExC_state) +STATIC SV * S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags); +#define PERL_ARGS_ASSERT_REG_SCAN_NAME \ + assert(pRExC_state) +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE char * S_reg_skipcomment(RExC_state_t *pRExC_state, char * p); +#define PERL_ARGS_ASSERT_REG_SKIPCOMMENT \ + assert(pRExC_state); assert(p) +#endif +STATIC regnode_offset S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg); +#define PERL_ARGS_ASSERT_REGANODE \ + assert(pRExC_state) +STATIC regnode_offset S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth); +#define PERL_ARGS_ASSERT_REGATOM \ + assert(pRExC_state); assert(flagp) +STATIC regnode_offset S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth); +#define PERL_ARGS_ASSERT_REGBRANCH \ + assert(pRExC_state); assert(flagp) +STATIC regnode_offset S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool stop_at_1, bool allow_multi_fold, const bool silence_non_portable, const bool strict, bool optimizable, SV** ret_invlist); +#define PERL_ARGS_ASSERT_REGCLASS \ + assert(pRExC_state); assert(flagp) +STATIC unsigned int S_regex_set_precedence(const U8 my_operator) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_REGEX_SET_PRECEDENCE + +STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, const regnode_offset operand, const U32 depth); +#define PERL_ARGS_ASSERT_REGINSERT \ + assert(pRExC_state) +STATIC regnode_offset S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_len); +#define PERL_ARGS_ASSERT_REGNODE_GUTS \ + assert(pRExC_state) +STATIC regnode_offset S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth); +#define PERL_ARGS_ASSERT_REGPIECE \ + assert(pRExC_state); assert(flagp) +STATIC regnode_offset S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg); +#define PERL_ARGS_ASSERT_REGPNODE \ + assert(pRExC_state); assert(arg) +STATIC bool S_regtail(pTHX_ RExC_state_t * pRExC_state, const regnode_offset p, const regnode_offset val, const U32 depth) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_REGTAIL \ + assert(pRExC_state); assert(p); assert(val) + +STATIC void S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx); +#define PERL_ARGS_ASSERT_SET_REGEX_PV \ + assert(pRExC_state); assert(Rx) +STATIC void S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state, char ** p, const bool force_to_xmod); +#define PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT \ + assert(pRExC_state); assert(p) +#endif +#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_DQUOTE_C) || defined(PERL_IN_TOKE_C) +PERL_CALLCONV const char * Perl_form_alien_digit_msg(pTHX_ const U8 which, const STRLEN valids_len, const char * const first_bad, const char * const send, const bool UTF, const bool braced) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG \ + assert(first_bad); assert(send) + +PERL_CALLCONV bool Perl_grok_bslash_c(pTHX_ const char source, U8 * result, const char** message, U32 * packed_warn) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_GROK_BSLASH_C \ + assert(result); assert(message) + +PERL_CALLCONV bool Perl_grok_bslash_o(pTHX_ char** s, const char* const send, UV* uv, const char** message, U32 * packed_warn, const bool strict, const bool allow_UV_MAX, const bool utf8) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_GROK_BSLASH_O \ + assert(s); assert(send); assert(uv); assert(message) + +PERL_CALLCONV bool Perl_grok_bslash_x(pTHX_ char** s, const char* const send, UV* uv, const char** message, U32 * packed_warn, const bool strict, const bool allow_UV_MAX, const bool utf8) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_GROK_BSLASH_X \ + assert(s); assert(send); assert(uv); assert(message) + +#endif +#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_DQUOTE_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) +PERL_CALLCONV const char * Perl_form_cp_too_large_msg(pTHX_ const U8 which, const char * string, const Size_t len, const UV cp) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG + +#endif #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) PERL_CALLCONV bool Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 *strend, const UV cp) __attribute__warn_unused_result__ @@ -6887,11 +6812,6 @@ PERL_CALLCONV UV Perl__to_fold_latin1(const U8 c, U8 *p, STRLEN *lenp, const uns #define PERL_ARGS_ASSERT__TO_FOLD_LATIN1 \ assert(p); assert(lenp) #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_SV_C) -PERL_CALLCONV SV* Perl_invlist_clone(pTHX_ SV* const invlist, SV* newlist); -#define PERL_ARGS_ASSERT_INVLIST_CLONE \ - assert(invlist) -#endif #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) PERL_CALLCONV bool Perl_regcurly(const char *s, const char *e, const char * result[5]) __attribute__warn_unused_result__; @@ -6899,6 +6819,89 @@ PERL_CALLCONV bool Perl_regcurly(const char *s, const char *e, const char * resu assert(s); assert(e) #endif +#if defined(PERL_IN_REGCOMP_DEBUG_C) && defined(DEBUGGING) +STATIC U8 S_put_charclass_bitmap_innards(pTHX_ SV* sv, char* bitmap, SV* nonbitmap_invlist, SV* only_utf8_locale_invlist, const regnode * const node, const U8 flags, const bool force_as_is_display); +#define PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS \ + assert(sv) +STATIC SV* S_put_charclass_bitmap_innards_common(pTHX_ SV* invlist, SV* posixes, SV* only_utf8, SV* not_utf8, SV* only_utf8_locale, const bool invert); +#define PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON \ + assert(invlist) +STATIC void S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist); +#define PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST \ + assert(sv); assert(invlist) +STATIC void S_put_code_point(pTHX_ SV* sv, UV c); +#define PERL_ARGS_ASSERT_PUT_CODE_POINT \ + assert(sv) +STATIC void S_put_range(pTHX_ SV* sv, UV start, const UV end, const bool allow_literals); +#define PERL_ARGS_ASSERT_PUT_RANGE \ + assert(sv) +STATIC void S_regdump_extflags(pTHX_ const char *lead, const U32 flags); +#define PERL_ARGS_ASSERT_REGDUMP_EXTFLAGS +STATIC void S_regdump_intflags(pTHX_ const char *lead, const U32 flags); +#define PERL_ARGS_ASSERT_REGDUMP_INTFLAGS +#endif +#if defined(PERL_IN_REGCOMP_INVLIST_C) || defined(PERL_IN_REGCOMP_C) +PERL_CALLCONV void Perl_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len) + __attribute__visibility__("hidden"); +#define PERL_ARGS_ASSERT_POPULATE_BITMAP_FROM_INVLIST \ + assert(invlist); assert(bitmap) + +PERL_CALLCONV void Perl_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_len, SV ** invlist, const UV offset) + __attribute__visibility__("hidden"); +#define PERL_ARGS_ASSERT_POPULATE_INVLIST_FROM_BITMAP \ + assert(bitmap); assert(invlist) + +#endif +#if defined(PERL_IN_REGCOMP_STUDY_C) +STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass* const node); +#define PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC \ + assert(pRExC_state); assert(node) +STATIC SV* S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST \ + assert(pRExC_state); assert(node) + +STATIC void S_rck_elide_nothing(pTHX_ regnode *node); +#define PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING \ + assert(node) +STATIC void S_ssc_add_range(pTHX_ regnode_ssc *ssc, UV const start, UV const end); +#define PERL_ARGS_ASSERT_SSC_ADD_RANGE \ + assert(ssc) +STATIC void S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, const regnode_charclass *and_with); +#define PERL_ARGS_ASSERT_SSC_AND \ + assert(pRExC_state); assert(ssc); assert(and_with) +STATIC void S_ssc_anything(pTHX_ regnode_ssc *ssc); +#define PERL_ARGS_ASSERT_SSC_ANYTHING \ + assert(ssc) +STATIC void S_ssc_clear_locale(regnode_ssc *ssc); +#define PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE \ + assert(ssc) +STATIC void S_ssc_cp_and(pTHX_ regnode_ssc *ssc, UV const cp); +#define PERL_ARGS_ASSERT_SSC_CP_AND \ + assert(ssc) +STATIC void S_ssc_intersection(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert_2nd); +#define PERL_ARGS_ASSERT_SSC_INTERSECTION \ + assert(ssc); assert(invlist) +STATIC int S_ssc_is_anything(const regnode_ssc *ssc) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_SSC_IS_ANYTHING \ + assert(ssc) + +STATIC int S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, const regnode_ssc *ssc) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT \ + assert(pRExC_state); assert(ssc) + +STATIC void S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, const regnode_charclass *or_with); +#define PERL_ARGS_ASSERT_SSC_OR \ + assert(pRExC_state); assert(ssc); assert(or_with) +STATIC void S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert_2nd); +#define PERL_ARGS_ASSERT_SSC_UNION \ + assert(ssc); assert(invlist) +STATIC void S_unwind_scan_frames(pTHX_ const void *p); +#define PERL_ARGS_ASSERT_UNWIND_SCAN_FRAMES \ + assert(p) +#endif #if defined(PERL_IN_REGEXEC_C) STATIC LB_enum S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) __attribute__warn_unused_result__; @@ -7045,6 +7048,64 @@ STATIC void S_to_utf8_substr(pTHX_ regexp * prog); #define PERL_ARGS_ASSERT_TO_UTF8_SUBSTR \ assert(prog) #endif +#if defined(PERL_IN_REGEX_ENGINE) +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE bool Perl_check_regnode_after(pTHX_ const regnode* p, const STRLEN extra) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_CHECK_REGNODE_AFTER +#endif + +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE regnode* Perl_regnext(pTHX_ const regnode* p) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_REGNEXT +#endif + +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE regnode* Perl_regnode_after(pTHX_ const regnode* p, bool varies) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_REGNODE_AFTER +#endif + +#endif +#if defined(PERL_IN_REGEX_ENGINE) && defined(DEBUGGING) +PERL_CALLCONV void Perl_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state, regnode *scan, U32 depth, U32 flags) + __attribute__visibility__("hidden"); +#define PERL_ARGS_ASSERT_DEBUG_PEEP \ + assert(str); assert(pRExC_state) + +PERL_CALLCONV void Perl_debug_show_study_flags(pTHX_ U32 flags, const char *open_str, const char *close_str) + __attribute__visibility__("hidden"); +#define PERL_ARGS_ASSERT_DEBUG_SHOW_STUDY_FLAGS \ + assert(open_str); assert(close_str) + +PERL_CALLCONV void Perl_debug_studydata(pTHX_ const char *where, scan_data_t *data, U32 depth, int is_inf, SSize_t min, SSize_t stopmin, SSize_t delta) + __attribute__visibility__("hidden"); +#define PERL_ARGS_ASSERT_DEBUG_STUDYDATA \ + assert(where) + +PERL_CALLCONV const regnode* Perl_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, const regnode *last, const regnode *plast, SV* sv, I32 indent, U32 depth) + __attribute__visibility__("hidden"); +#define PERL_ARGS_ASSERT_DUMPUNTIL \ + assert(r); assert(start); assert(node); assert(sv) + +PERL_CALLCONV int Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...) + __attribute__visibility__("hidden"); +#define PERL_ARGS_ASSERT_RE_INDENTF \ + assert(fmt) + +PERL_CALLCONV int Perl_re_printf(pTHX_ const char *fmt, ...) + __attribute__visibility__("hidden") + __attribute__format__(__printf__,pTHX_1,pTHX_2); +#define PERL_ARGS_ASSERT_RE_PRINTF \ + assert(fmt) + +PERL_CALLCONV void Perl_regprop(pTHX_ const regexp *prog, SV* sv, const regnode* o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state) + __attribute__visibility__("hidden"); +#define PERL_ARGS_ASSERT_REGPROP \ + assert(sv); assert(o) + +#endif #if defined(PERL_IN_SCOPE_C) STATIC void S_save_pushptri32ptr(pTHX_ void *const ptr1, const I32 i, void *const ptr2, const int type); #define PERL_ARGS_ASSERT_SAVE_PUSHPTRI32PTR @@ -31,29 +31,25 @@ * with the POSIX routines of the same names. */ -#ifdef PERL_EXT_RE_BUILD -#include "re_top.h" -#endif - /* * pregcomp and pregexec -- regsub and regerror are not used in perl * - * Copyright (c) 1986 by University of Toronto. - * Written by Henry Spencer. Not derived from licensed software. + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. * - * Permission is granted to anyone to use this software for any - * purpose on any computer system, and to redistribute it freely, - * subject to the following restrictions: + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: * - * 1. The author is not responsible for the consequences of use of - * this software, no matter how awful, even if they arise - * from defects in it. + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. * - * 2. The origin of this software must not be misrepresented, either - * by explicit claim or by omission. + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. * - * 3. Altered versions must be plainly marked as such, and must not - * be misrepresented as being the original software. + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. * * **** Alterations to Henry's code are... @@ -127,11 +123,16 @@ * access data that we don't want to duplicate. */ +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + #include "EXTERN.h" +#define PERL_IN_REGEX_ENGINE +#define PERL_IN_REGCOMP_ANY #define PERL_IN_REGCOMP_C #include "perl.h" -#define REG_COMP_C #ifdef PERL_IN_XSUB_RE # include "re_comp.h" EXTERN_C const struct regexp_engine my_reg_engine; @@ -142,1349 +143,7 @@ EXTERN_C const struct regexp_engine wild_reg_engine; #include "invlist_inline.h" #include "unicode_constants.h" - -#ifndef STATIC -#define STATIC static -#endif - -/* this is a chain of data about sub patterns we are processing that - need to be handled separately/specially in study_chunk. Its so - we can simulate recursion without losing state. */ -struct scan_frame; -typedef struct scan_frame { - regnode *last_regnode; /* last node to process in this frame */ - regnode *next_regnode; /* next node to process when last is reached */ - U32 prev_recursed_depth; - I32 stopparen; /* what stopparen do we use */ - bool in_gosub; /* this or an outer frame is for GOSUB */ - - struct scan_frame *this_prev_frame; /* this previous frame */ - struct scan_frame *prev_frame; /* previous frame */ - struct scan_frame *next_frame; /* next frame */ -} scan_frame; - -/* Certain characters are output as a sequence with the first being a - * backslash. */ -#define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c) - - -struct RExC_state_t { - U32 flags; /* RXf_* are we folding, multilining? */ - U32 pm_flags; /* PMf_* stuff from the calling PMOP */ - char *precomp; /* uncompiled string. */ - char *precomp_end; /* pointer to end of uncompiled string. */ - REGEXP *rx_sv; /* The SV that is the regexp. */ - regexp *rx; /* perl core regexp structure */ - regexp_internal *rxi; /* internal data for regexp object - pprivate field */ - char *start; /* Start of input for compile */ - char *end; /* End of input for compile */ - char *parse; /* Input-scan pointer. */ - char *copy_start; /* start of copy of input within - constructed parse string */ - char *save_copy_start; /* Provides one level of saving - and restoring 'copy_start' */ - char *copy_start_in_input; /* Position in input string - corresponding to copy_start */ - SSize_t whilem_seen; /* number of WHILEM in this expr */ - regnode *emit_start; /* Start of emitted-code area */ - regnode_offset emit; /* Code-emit pointer */ - I32 naughty; /* How bad is this pattern? */ - I32 sawback; /* Did we see \1, ...? */ - SSize_t size; /* Number of regnode equivalents in - pattern */ - Size_t sets_depth; /* Counts recursion depth of already- - compiled regex set patterns */ - U32 seen; - - I32 parens_buf_size; /* #slots malloced open/close_parens */ - regnode_offset *open_parens; /* offsets to open parens */ - regnode_offset *close_parens; /* offsets to close parens */ - HV *paren_names; /* Paren names */ - - /* position beyond 'precomp' of the warning message furthest away from - * 'precomp'. During the parse, no warnings are raised for any problems - * earlier in the parse than this position. This works if warnings are - * raised the first time a given spot is parsed, and if only one - * independent warning is raised for any given spot */ - Size_t latest_warn_offset; - - I32 npar; /* Capture buffer count so far in the - parse, (OPEN) plus one. ("par" 0 is - the whole pattern)*/ - I32 total_par; /* During initial parse, is either 0, - or -1; the latter indicating a - reparse is needed. After that pass, - it is what 'npar' became after the - pass. Hence, it being > 0 indicates - we are in a reparse situation */ - I32 nestroot; /* root parens we are in - used by - accept */ - I32 seen_zerolen; - regnode *end_op; /* END node in program */ - I32 utf8; /* whether the pattern is utf8 or not */ - I32 orig_utf8; /* whether the pattern was originally in utf8 */ - /* XXX use this for future optimisation of case - * where pattern must be upgraded to utf8. */ - I32 uni_semantics; /* If a d charset modifier should use unicode - rules, even if the pattern is not in - utf8 */ - - I32 recurse_count; /* Number of recurse regops we have generated */ - regnode **recurse; /* Recurse regops */ - U8 *study_chunk_recursed; /* bitmap of which subs we have moved - through */ - U32 study_chunk_recursed_bytes; /* bytes in bitmap */ - I32 in_lookaround; - I32 contains_locale; - I32 override_recoding; - I32 recode_x_to_native; - I32 in_multi_char_class; - int code_index; /* next code_blocks[] slot */ - struct reg_code_blocks *code_blocks;/* positions of literal (?{}) - within pattern */ - SSize_t maxlen; /* mininum possible number of chars in string to match */ - scan_frame *frame_head; - scan_frame *frame_last; - U32 frame_count; - AV *warn_text; - HV *unlexed_names; - SV *runtime_code_qr; /* qr with the runtime code blocks */ -#ifdef DEBUGGING - const char *lastparse; - I32 lastnum; - U32 study_chunk_recursed_count; - AV *paren_name_list; /* idx -> name */ - SV *mysv1; - SV *mysv2; - -#define RExC_lastparse (pRExC_state->lastparse) -#define RExC_lastnum (pRExC_state->lastnum) -#define RExC_paren_name_list (pRExC_state->paren_name_list) -#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count) -#define RExC_mysv (pRExC_state->mysv1) -#define RExC_mysv1 (pRExC_state->mysv1) -#define RExC_mysv2 (pRExC_state->mysv2) - -#endif - bool seen_d_op; - bool strict; - bool study_started; - bool in_script_run; - bool use_BRANCHJ; - bool sWARN_EXPERIMENTAL__VLB; - bool sWARN_EXPERIMENTAL__REGEX_SETS; -}; - -#define RExC_flags (pRExC_state->flags) -#define RExC_pm_flags (pRExC_state->pm_flags) -#define RExC_precomp (pRExC_state->precomp) -#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input) -#define RExC_copy_start_in_constructed (pRExC_state->copy_start) -#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start) -#define RExC_precomp_end (pRExC_state->precomp_end) -#define RExC_rx_sv (pRExC_state->rx_sv) -#define RExC_rx (pRExC_state->rx) -#define RExC_rxi (pRExC_state->rxi) -#define RExC_start (pRExC_state->start) -#define RExC_end (pRExC_state->end) -#define RExC_parse (pRExC_state->parse) -#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset ) -#define RExC_whilem_seen (pRExC_state->whilem_seen) -#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs - under /d from /u ? */ - -#define RExC_emit (pRExC_state->emit) -#define RExC_emit_start (pRExC_state->emit_start) -#define RExC_sawback (pRExC_state->sawback) -#define RExC_seen (pRExC_state->seen) -#define RExC_size (pRExC_state->size) -#define RExC_maxlen (pRExC_state->maxlen) -#define RExC_npar (pRExC_state->npar) -#define RExC_total_parens (pRExC_state->total_par) -#define RExC_parens_buf_size (pRExC_state->parens_buf_size) -#define RExC_nestroot (pRExC_state->nestroot) -#define RExC_seen_zerolen (pRExC_state->seen_zerolen) -#define RExC_utf8 (pRExC_state->utf8) -#define RExC_uni_semantics (pRExC_state->uni_semantics) -#define RExC_orig_utf8 (pRExC_state->orig_utf8) -#define RExC_open_parens (pRExC_state->open_parens) -#define RExC_close_parens (pRExC_state->close_parens) -#define RExC_end_op (pRExC_state->end_op) -#define RExC_paren_names (pRExC_state->paren_names) -#define RExC_recurse (pRExC_state->recurse) -#define RExC_recurse_count (pRExC_state->recurse_count) -#define RExC_sets_depth (pRExC_state->sets_depth) -#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) -#define RExC_study_chunk_recursed_bytes \ - (pRExC_state->study_chunk_recursed_bytes) -#define RExC_in_lookaround (pRExC_state->in_lookaround) -#define RExC_contains_locale (pRExC_state->contains_locale) -#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) - -#ifdef EBCDIC -# define SET_recode_x_to_native(x) \ - STMT_START { RExC_recode_x_to_native = (x); } STMT_END -#else -# define SET_recode_x_to_native(x) NOOP -#endif - -#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) -#define RExC_frame_head (pRExC_state->frame_head) -#define RExC_frame_last (pRExC_state->frame_last) -#define RExC_frame_count (pRExC_state->frame_count) -#define RExC_strict (pRExC_state->strict) -#define RExC_study_started (pRExC_state->study_started) -#define RExC_warn_text (pRExC_state->warn_text) -#define RExC_in_script_run (pRExC_state->in_script_run) -#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ) -#define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB) -#define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS) -#define RExC_unlexed_names (pRExC_state->unlexed_names) - - -/***********************************************************************/ -/* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse - * - * All of these macros depend on the above RExC_ accessor macros, which - * in turns depend on a variable pRExC_state being in scope where they - * are used. This is the standard regexp parser context variable which is - * passed into every non-trivial parse function in this file. - * - * Note that the UTF macro is itself a wrapper around RExC_utf8, so all - * of the macros which do not take an argument will operate on the - * pRExC_state structure *only*. - * - * Please do NOT modify RExC_parse without using these macros. In the - * future these macros will be extended for enhanced debugging and trace - * output during the parse process. - */ - -/* RExC_parse_incf(flag) - * - * Increment RExC_parse to point at the next codepoint, while doing - * the right thing depending on whether we are parsing UTF-8 strings - * or not. The 'flag' argument determines if content is UTF-8 or not, - * intended for cases where this is NOT governed by the UTF macro. - * - * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro. - * - * WARNING: Does NOT take into account RExC_end; it is the callers - * responsibility to make sure there are enough octets left in - * RExC_parse to ensure that when processing UTF-8 we would not read - * past the end of the string. - */ -#define RExC_parse_incf(flag) STMT_START { \ - RExC_parse += (flag) ? UTF8SKIP(RExC_parse) : 1; \ -} STMT_END - -/* RExC_parse_inc_safef(flag) - * - * Safely increment RExC_parse to point at the next codepoint, - * doing the right thing depending on whether we are parsing - * UTF-8 strings or not and NOT reading past the end of the buffer. - * The 'flag' argument determines if content is UTF-8 or not, - * intended for cases where this is NOT governed by the UTF macro. - * - * Use RExC_parse_safe() if UTF-8ness is controlled by the UTF macro. - * - * NOTE: Will NOT read past RExC_end when content is UTF-8. - */ -#define RExC_parse_inc_safef(flag) STMT_START { \ - RExC_parse += (flag) ? UTF8_SAFE_SKIP(RExC_parse,RExC_end) : 1; \ -} STMT_END - -/* RExC_parse_inc() - * - * Increment RExC_parse to point at the next codepoint, - * doing the right thing depending on whether we are parsing - * UTF-8 strings or not. - * - * WARNING: Does NOT take into account RExC_end, it is the callers - * responsibility to make sure there are enough octets left in - * RExC_parse to ensure that when processing UTF-8 we would not read - * past the end of the string. - * - * NOTE: whether we are parsing UTF-8 or not is determined by the - * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this - * macro operates on the pRExC_state structure only. - */ -#define RExC_parse_inc() RExC_parse_incf(UTF) - -/* RExC_parse_inc_safe() - * - * Safely increment RExC_parse to point at the next codepoint, - * doing the right thing depending on whether we are parsing - * UTF-8 strings or not and NOT reading past the end of the buffer. - * - * NOTE: whether we are parsing UTF-8 or not is determined by the - * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this - * macro operates on the pRExC_state structure only. - */ -#define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF) - -/* RExC_parse_inc_utf8() - * - * Increment RExC_parse to point at the next utf8 codepoint, - * assumes content is UTF-8. - * - * WARNING: Does NOT take into account RExC_end; it is the callers - * responsibility to make sure there are enough octets left in RExC_parse - * to ensure that when processing UTF-8 we would not read past the end - * of the string. - */ -#define RExC_parse_inc_utf8() STMT_START { \ - RExC_parse += UTF8SKIP(RExC_parse); \ -} STMT_END - -/* RExC_parse_inc_if_char() - * - * Increment RExC_parse to point at the next codepoint, if and only - * if the current parse point is NOT a NULL, while doing the right thing - * depending on whether we are parsing UTF-8 strings or not. - * - * WARNING: Does NOT take into account RExC_end, it is the callers - * responsibility to make sure there are enough octets left in RExC_parse - * to ensure that when processing UTF-8 we would not read past the end - * of the string. - * - * NOTE: whether we are parsing UTF-8 or not is determined by the - * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this - * macro operates on the pRExC_state structure only. - */ -#define RExC_parse_inc_if_char() STMT_START { \ - RExC_parse += SKIP_IF_CHAR(RExC_parse,RExC_end); \ -} STMT_END - -/* RExC_parse_inc_by(n_octets) - * - * Increment the parse cursor by the number of octets specified by - * the 'n_octets' argument. - * - * NOTE: Does NOT check ANY constraints. It is the callers responsibility - * that this will not move past the end of the string, or leave the - * pointer in the middle of a UTF-8 sequence. - * - * Typically used to advanced past previously analyzed content. - */ -#define RExC_parse_inc_by(n_octets) STMT_START { \ - RExC_parse += (n_octets); \ -} STMT_END - -/* RExC_parse_set(to_ptr) - * - * Sets the RExC_parse pointer to the pointer specified by the 'to' - * argument. No validation whatsoever is performed on the to pointer. - */ -#define RExC_parse_set(to_ptr) STMT_START { \ - RExC_parse = (to_ptr); \ -} STMT_END - -/**********************************************************************/ - -/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set - * a flag to disable back-off on the fixed/floating substrings - if it's - * a high complexity pattern we assume the benefit of avoiding a full match - * is worth the cost of checking for the substrings even if they rarely help. - */ -#define RExC_naughty (pRExC_state->naughty) -#define TOO_NAUGHTY (10) -#define MARK_NAUGHTY(add) \ - if (RExC_naughty < TOO_NAUGHTY) \ - RExC_naughty += (add) -#define MARK_NAUGHTY_EXP(exp, add) \ - if (RExC_naughty < TOO_NAUGHTY) \ - RExC_naughty += RExC_naughty / (exp) + (add) - -#define isNON_BRACE_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?') -#define isQUANTIFIER(s,e) ( isNON_BRACE_QUANTIFIER(*s) \ - || ((*s) == '{' && regcurly(s, e, NULL))) - -/* - * Flags to be passed up. - */ -#define HASWIDTH 0x01 /* Known to not match null strings, could match - non-null ones. */ -#define SIMPLE 0x02 /* Exactly one character wide */ - /* (or LNBREAK as a special case) */ -#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ -#define TRYAGAIN 0x10 /* Weeded out a declaration. */ -#define RESTART_PARSE 0x20 /* Need to redo the parse */ -#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to - calcuate sizes as UTF-8 */ - -#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) - -/* whether trie related optimizations are enabled */ -#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION -#define TRIE_STUDY_OPT -#define FULL_TRIE_STUDY -#define TRIE_STCLASS -#endif - -/* About the term "restudy" and the var "restudied" and the defines - * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to - * doing multiple study_chunk() calls over the same set of opcodes for* the - * purpose of enhanced TRIE optimizations. - * - * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal - * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled - * which then causes the Perl_re_op_compile() to then call the optimizer - * S_study_chunk() a second time to perform additional optimizations, - * including the aho_corasick startclass optimization. - * This additional pass will only happen once, which is managed by the - * 'restudied' variable in Perl_re_op_compile(). - * - * When this second pass is under way the flags passed into study_chunk() will - * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down - * to any recursive calls to S_study_chunk(). - * - * IMPORTANT: Any logic in study_chunk() that emits warnings should check that - * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may - * be produced twice. - * - * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and - * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details. - */ - - -#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] -#define PBITVAL(paren) (1 << ((paren) & 7)) -#define PAREN_OFFSET(depth) \ - (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes) -#define PAREN_TEST(depth, paren) \ - (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren)) -#define PAREN_SET(depth, paren) \ - (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren)) -#define PAREN_UNSET(depth, paren) \ - (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren)) - -#define REQUIRE_UTF8(flagp) STMT_START { \ - if (!UTF) { \ - *flagp = RESTART_PARSE|NEED_UTF8; \ - return 0; \ - } \ - } STMT_END - -/* /u is to be chosen if we are supposed to use Unicode rules, or if the - * pattern is in UTF-8. This latter condition is in case the outermost rules - * are locale. See GH #17278 */ -#define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF) - -/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is - * a flag that indicates we need to override /d with /u as a result of - * something in the pattern. It should only be used in regards to calling - * set_regex_charset() or get_regex_charset() */ -#define REQUIRE_UNI_RULES(flagp, restart_retval) \ - STMT_START { \ - if (DEPENDS_SEMANTICS) { \ - set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ - RExC_uni_semantics = 1; \ - if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \ - /* No need to restart the parse if we haven't seen \ - * anything that differs between /u and /d, and no need \ - * to restart immediately if we're going to reparse \ - * anyway to count parens */ \ - *flagp |= RESTART_PARSE; \ - return restart_retval; \ - } \ - } \ - } STMT_END - -#define REQUIRE_BRANCHJ(flagp, restart_retval) \ - STMT_START { \ - RExC_use_BRANCHJ = 1; \ - *flagp |= RESTART_PARSE; \ - return restart_retval; \ - } STMT_END - -/* Until we have completed the parse, we leave RExC_total_parens at 0 or - * less. After that, it must always be positive, because the whole re is - * considered to be surrounded by virtual parens. Setting it to negative - * indicates there is some construct that needs to know the actual number of - * parens to be properly handled. And that means an extra pass will be - * required after we've counted them all */ -#define ALL_PARENS_COUNTED (RExC_total_parens > 0) -#define REQUIRE_PARENS_PASS \ - STMT_START { /* No-op if have completed a pass */ \ - if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \ - } STMT_END -#define IN_PARENS_PASS (RExC_total_parens < 0) - - -/* This is used to return failure (zero) early from the calling function if - * various flags in 'flags' are set. Two flags always cause a return: - * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any - * additional flags that should cause a return; 0 if none. If the return will - * be done, '*flagp' is first set to be all of the flags that caused the - * return. */ -#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \ - STMT_START { \ - if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \ - *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \ - return 0; \ - } \ - } STMT_END - -#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE)) - -#define RETURN_FAIL_ON_RESTART(flags,flagp) \ - RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0) -#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \ - if (MUST_RESTART(*(flagp))) return 0 - -/* This converts the named class defined in regcomp.h to its equivalent class - * number defined in handy.h. */ -#define namedclass_to_classnum(class) ((int) ((class) / 2)) -#define classnum_to_namedclass(classnum) ((classnum) * 2) - -#define _invlist_union_complement_2nd(a, b, output) \ - _invlist_union_maybe_complement_2nd(a, b, TRUE, output) -#define _invlist_intersection_complement_2nd(a, b, output) \ - _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) - -/* We add a marker if we are deferring expansion of a property that is both - * 1) potentiallly user-defined; and - * 2) could also be an official Unicode property. - * - * Without this marker, any deferred expansion can only be for a user-defined - * one. This marker shouldn't conflict with any that could be in a legal name, - * and is appended to its name to indicate this. There is a string and - * character form */ -#define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~" -#define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~' - -/* What is infinity for optimization purposes */ -#define OPTIMIZE_INFTY SSize_t_MAX - -/* About scan_data_t. - - During optimisation we recurse through the regexp program performing - various inplace (keyhole style) optimisations. In addition study_chunk - and scan_commit populate this data structure with information about - what strings MUST appear in the pattern. We look for the longest - string that must appear at a fixed location, and we look for the - longest string that may appear at a floating location. So for instance - in the pattern: - - /FOO[xX]A.*B[xX]BAR/ - - Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating - strings (because they follow a .* construct). study_chunk will identify - both FOO and BAR as being the longest fixed and floating strings respectively. - - The strings can be composites, for instance - - /(f)(o)(o)/ - - will result in a composite fixed substring 'foo'. - - For each string some basic information is maintained: - - - min_offset - This is the position the string must appear at, or not before. - It also implicitly (when combined with minlenp) tells us how many - characters must match before the string we are searching for. - Likewise when combined with minlenp and the length of the string it - tells us how many characters must appear after the string we have - found. - - - max_offset - Only used for floating strings. This is the rightmost point that - the string can appear at. If set to OPTIMIZE_INFTY it indicates that the - string can occur infinitely far to the right. - For fixed strings, it is equal to min_offset. - - - minlenp - A pointer to the minimum number of characters of the pattern that the - string was found inside. This is important as in the case of positive - lookahead or positive lookbehind we can have multiple patterns - involved. Consider - - /(?=FOO).*F/ - - The minimum length of the pattern overall is 3, the minimum length - of the lookahead part is 3, but the minimum length of the part that - will actually match is 1. So 'FOO's minimum length is 3, but the - minimum length for the F is 1. This is important as the minimum length - is used to determine offsets in front of and behind the string being - looked for. Since strings can be composites this is the length of the - pattern at the time it was committed with a scan_commit. Note that - the length is calculated by study_chunk, so that the minimum lengths - are not known until the full pattern has been compiled, thus the - pointer to the value. - - - lookbehind - - In the case of lookbehind the string being searched for can be - offset past the start point of the final matching string. - If this value was just blithely removed from the min_offset it would - invalidate some of the calculations for how many chars must match - before or after (as they are derived from min_offset and minlen and - the length of the string being searched for). - When the final pattern is compiled and the data is moved from the - scan_data_t structure into the regexp structure the information - about lookbehind is factored in, with the information that would - have been lost precalculated in the end_shift field for the - associated string. - - The fields pos_min and pos_delta are used to store the minimum offset - and the delta to the maximum offset at the current point in the pattern. - -*/ - -struct scan_data_substrs { - SV *str; /* longest substring found in pattern */ - SSize_t min_offset; /* earliest point in string it can appear */ - SSize_t max_offset; /* latest point in string it can appear */ - SSize_t *minlenp; /* pointer to the minlen relevant to the string */ - SSize_t lookbehind; /* is the pos of the string modified by LB */ - I32 flags; /* per substring SF_* and SCF_* flags */ -}; - -typedef struct scan_data_t { - /*I32 len_min; unused */ - /*I32 len_delta; unused */ - SSize_t pos_min; - SSize_t pos_delta; - SV *last_found; - SSize_t last_end; /* min value, <0 unless valid. */ - SSize_t last_start_min; - SSize_t last_start_max; - U8 cur_is_floating; /* whether the last_* values should be set as - * the next fixed (0) or floating (1) - * substring */ - - /* [0] is longest fixed substring so far, [1] is longest float so far */ - struct scan_data_substrs substrs[2]; - - I32 flags; /* common SF_* and SCF_* flags */ - I32 whilem_c; - SSize_t *last_closep; - regnode **last_close_opp; /* pointer to pointer to last CLOSE regop - seen. DO NOT DEREFERENCE the regnode - pointer - the op may have been optimized - away */ - regnode_ssc *start_class; -} scan_data_t; - -/* - * Forward declarations for pregcomp()'s friends. - */ - -static const scan_data_t zero_scan_data = { - 0, 0, NULL, 0, 0, 0, 0, - { - { NULL, 0, 0, 0, 0, 0 }, - { NULL, 0, 0, 0, 0, 0 }, - }, - 0, 0, NULL, NULL, NULL -}; - -/* study flags */ - -#define SF_BEFORE_SEOL 0x0001 -#define SF_BEFORE_MEOL 0x0002 -#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) - -#define SF_IS_INF 0x0040 -#define SF_HAS_PAR 0x0080 -#define SF_IN_PAR 0x0100 -#define SF_HAS_EVAL 0x0200 - - -/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the - * longest substring in the pattern. When it is not set the optimiser keeps - * track of position, but does not keep track of the actual strings seen, - * - * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but - * /foo/i will not. - * - * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble" - * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be - * turned off because of the alternation (BRANCH). */ -#define SCF_DO_SUBSTR 0x0400 - -#define SCF_DO_STCLASS_AND 0x0800 -#define SCF_DO_STCLASS_OR 0x1000 -#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) -#define SCF_WHILEM_VISITED_POS 0x2000 - -#define SCF_TRIE_RESTUDY 0x4000 /* Need to do restudy in study_chunk()? - Search for "restudy" in this file - to find a detailed explanation.*/ -#define SCF_SEEN_ACCEPT 0x8000 -#define SCF_TRIE_DOING_RESTUDY 0x10000 /* Are we in restudy right now? - Search for "restudy" in this file - to find a detailed explanation. */ -#define SCF_IN_DEFINE 0x20000 - - - -#define UTF cBOOL(RExC_utf8) - -/* The enums for all these are ordered so things work out correctly */ -#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) -#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ - == REGEX_DEPENDS_CHARSET) -#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) -#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ - >= REGEX_UNICODE_CHARSET) -#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ - == REGEX_ASCII_RESTRICTED_CHARSET) -#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ - >= REGEX_ASCII_RESTRICTED_CHARSET) -#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ - == REGEX_ASCII_MORE_RESTRICTED_CHARSET) - -#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) - -/* For programs that want to be strictly Unicode compatible by dying if any - * attempt is made to match a non-Unicode code point against a Unicode - * property. */ -#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) - -#define OOB_NAMEDCLASS -1 - -/* There is no code point that is out-of-bounds, so this is problematic. But - * its only current use is to initialize a variable that is always set before - * looked at. */ -#define OOB_UNICODE 0xDEADBEEF - -#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) - - -/* length of regex to show in messages that don't mark a position within */ -#define RegexLengthToShowInErrorMessages 127 - -/* - * If MARKER[12] are adjusted, be sure to adjust the constants at the top - * of t/op/regmesg.t, the tests in t/op/re_tests, and those in - * op/pragma/warn/regcomp. - */ -#define MARKER1 "<-- HERE" /* marker as it appears in the description */ -#define MARKER2 " <-- HERE " /* marker as it appears within the regex */ - -#define REPORT_LOCATION " in regex; marked by " MARKER1 \ - " in m/%" UTF8f MARKER2 "%" UTF8f "/" - -/* The code in this file in places uses one level of recursion with parsing - * rebased to an alternate string constructed by us in memory. This can take - * the form of something that is completely different from the input, or - * something that uses the input as part of the alternate. In the first case, - * there should be no possibility of an error, as we are in complete control of - * the alternate string. But in the second case we don't completely control - * the input portion, so there may be errors in that. Here's an example: - * /[abc\x{DF}def]/ui - * is handled specially because \x{df} folds to a sequence of more than one - * character: 'ss'. What is done is to create and parse an alternate string, - * which looks like this: - * /(?:\x{DF}|[abc\x{DF}def])/ui - * where it uses the input unchanged in the middle of something it constructs, - * which is a branch for the DF outside the character class, and clustering - * parens around the whole thing. (It knows enough to skip the DF inside the - * class while in this substitute parse.) 'abc' and 'def' may have errors that - * need to be reported. The general situation looks like this: - * - * |<------- identical ------>| - * sI tI xI eI - * Input: --------------------------------------------------------------- - * Constructed: --------------------------------------------------- - * sC tC xC eC EC - * |<------- identical ------>| - * - * sI..eI is the portion of the input pattern we are concerned with here. - * sC..EC is the constructed substitute parse string. - * sC..tC is constructed by us - * tC..eC is an exact duplicate of the portion of the input pattern tI..eI. - * In the diagram, these are vertically aligned. - * eC..EC is also constructed by us. - * xC is the position in the substitute parse string where we found a - * problem. - * xI is the position in the original pattern corresponding to xC. - * - * We want to display a message showing the real input string. Thus we need to - * translate from xC to xI. We know that xC >= tC, since the portion of the - * string sC..tC has been constructed by us, and so shouldn't have errors. We - * get: - * xI = tI + (xC - tC) - * - * When the substitute parse is constructed, the code needs to set: - * RExC_start (sC) - * RExC_end (eC) - * RExC_copy_start_in_input (tI) - * RExC_copy_start_in_constructed (tC) - * and restore them when done. - * - * During normal processing of the input pattern, both - * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to - * sI, so that xC equals xI. - */ - -#define sI RExC_precomp -#define eI RExC_precomp_end -#define sC RExC_start -#define eC RExC_end -#define tI RExC_copy_start_in_input -#define tC RExC_copy_start_in_constructed -#define xI(xC) (tI + (xC - tC)) -#define xI_offset(xC) (xI(xC) - sI) - -#define REPORT_LOCATION_ARGS(xC) \ - UTF8fARG(UTF, \ - (xI(xC) > eI) /* Don't run off end */ \ - ? eI - sI /* Length before the <--HERE */ \ - : ((xI_offset(xC) >= 0) \ - ? xI_offset(xC) \ - : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \ - IVdf " trying to output message for " \ - " pattern %.*s", \ - __FILE__, __LINE__, (IV) xI_offset(xC), \ - ((int) (eC - sC)), sC), 0)), \ - sI), /* The input pattern printed up to the <--HERE */ \ - UTF8fARG(UTF, \ - (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \ - (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */ - -/* Used to point after bad bytes for an error message, but avoid skipping - * past a nul byte. */ -#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1) - -/* Set up to clean up after our imminent demise */ -#define PREPARE_TO_DIE \ - STMT_START { \ - if (RExC_rx_sv) \ - SAVEFREESV(RExC_rx_sv); \ - if (RExC_open_parens) \ - SAVEFREEPV(RExC_open_parens); \ - if (RExC_close_parens) \ - SAVEFREEPV(RExC_close_parens); \ - } STMT_END - -/* - * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given - * arg. Show regex, up to a maximum length. If it's too long, chop and add - * "...". - */ -#define _FAIL(code) STMT_START { \ - const char *ellipses = ""; \ - IV len = RExC_precomp_end - RExC_precomp; \ - \ - PREPARE_TO_DIE; \ - if (len > RegexLengthToShowInErrorMessages) { \ - /* chop 10 shorter than the max, to ensure meaning of "..." */ \ - len = RegexLengthToShowInErrorMessages - 10; \ - ellipses = "..."; \ - } \ - code; \ -} STMT_END - -#define FAIL(msg) _FAIL( \ - Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \ - msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) - -#define FAIL2(msg,arg) _FAIL( \ - Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ - arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) - -#define FAIL3(msg,arg1,arg2) _FAIL( \ - Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ - arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses)) - -/* - * Simple_vFAIL -- like FAIL, but marks the current location in the scan - */ -#define Simple_vFAIL(m) STMT_START { \ - Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ - m, REPORT_LOCATION_ARGS(RExC_parse)); \ -} STMT_END - -/* - * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() - */ -#define vFAIL(m) STMT_START { \ - PREPARE_TO_DIE; \ - Simple_vFAIL(m); \ -} STMT_END - -/* - * Like Simple_vFAIL(), but accepts two arguments. - */ -#define Simple_vFAIL2(m,a1) STMT_START { \ - S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \ - REPORT_LOCATION_ARGS(RExC_parse)); \ -} STMT_END - -/* - * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). - */ -#define vFAIL2(m,a1) STMT_START { \ - PREPARE_TO_DIE; \ - Simple_vFAIL2(m, a1); \ -} STMT_END - - -/* - * Like Simple_vFAIL(), but accepts three arguments. - */ -#define Simple_vFAIL3(m, a1, a2) STMT_START { \ - S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \ - REPORT_LOCATION_ARGS(RExC_parse)); \ -} STMT_END - -/* - * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). - */ -#define vFAIL3(m,a1,a2) STMT_START { \ - PREPARE_TO_DIE; \ - Simple_vFAIL3(m, a1, a2); \ -} STMT_END - -/* - * Like Simple_vFAIL(), but accepts four arguments. - */ -#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ - S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \ - REPORT_LOCATION_ARGS(RExC_parse)); \ -} STMT_END - -#define vFAIL4(m,a1,a2,a3) STMT_START { \ - PREPARE_TO_DIE; \ - Simple_vFAIL4(m, a1, a2, a3); \ -} STMT_END - -/* A specialized version of vFAIL2 that works with UTF8f */ -#define vFAIL2utf8f(m, a1) STMT_START { \ - PREPARE_TO_DIE; \ - S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \ - REPORT_LOCATION_ARGS(RExC_parse)); \ -} STMT_END - -#define vFAIL3utf8f(m, a1, a2) STMT_START { \ - PREPARE_TO_DIE; \ - S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \ - REPORT_LOCATION_ARGS(RExC_parse)); \ -} STMT_END - -/* Setting this to NULL is a signal to not output warnings */ -#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \ - STMT_START { \ - RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\ - RExC_copy_start_in_constructed = NULL; \ - } STMT_END -#define RESTORE_WARNINGS \ - RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed - -/* Since a warning can be generated multiple times as the input is reparsed, we - * output it the first time we come to that point in the parse, but suppress it - * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not - * generate any warnings */ -#define TO_OUTPUT_WARNINGS(loc) \ - ( RExC_copy_start_in_constructed \ - && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset) - -/* After we've emitted a warning, we save the position in the input so we don't - * output it again */ -#define UPDATE_WARNINGS_LOC(loc) \ - STMT_START { \ - if (TO_OUTPUT_WARNINGS(loc)) { \ - RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \ - - RExC_precomp; \ - } \ - } STMT_END - -/* 'warns' is the output of the packWARNx macro used in 'code' */ -#define _WARN_HELPER(loc, warns, code) \ - STMT_START { \ - if (! RExC_copy_start_in_constructed) { \ - Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \ - " expected at '%s'", \ - __FILE__, __LINE__, loc); \ - } \ - if (TO_OUTPUT_WARNINGS(loc)) { \ - if (ckDEAD(warns)) \ - PREPARE_TO_DIE; \ - code; \ - UPDATE_WARNINGS_LOC(loc); \ - } \ - } STMT_END - -/* m is not necessarily a "literal string", in this macro */ -#define warn_non_literal_string(loc, packed_warn, m) \ - _WARN_HELPER(loc, packed_warn, \ - Perl_warner(aTHX_ packed_warn, \ - "%s" REPORT_LOCATION, \ - m, REPORT_LOCATION_ARGS(loc))) -#define reg_warn_non_literal_string(loc, m) \ - warn_non_literal_string(loc, packWARN(WARN_REGEXP), m) - -#define ckWARN2_non_literal_string(loc, packwarn, m, a1) \ - STMT_START { \ - char * format; \ - Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\ - Newx(format, format_size, char); \ - my_strlcpy(format, m, format_size); \ - my_strlcat(format, REPORT_LOCATION, format_size); \ - SAVEFREEPV(format); \ - _WARN_HELPER(loc, packwarn, \ - Perl_ck_warner(aTHX_ packwarn, \ - format, \ - a1, REPORT_LOCATION_ARGS(loc))); \ - } STMT_END - -#define ckWARNreg(loc,m) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(loc))) - -#define vWARN(loc, m) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(loc))) \ - -#define vWARN_dep(loc, m) \ - _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \ - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(loc))) - -#define ckWARNdep(loc,m) \ - _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \ - Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(loc))) - -#define ckWARNregdep(loc,m) \ - _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \ - WARN_REGEXP), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(loc))) - -#define ckWARN2reg_d(loc,m, a1) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, REPORT_LOCATION_ARGS(loc))) - -#define ckWARN2reg(loc, m, a1) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, REPORT_LOCATION_ARGS(loc))) - -#define vWARN3(loc, m, a1, a2) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, a2, REPORT_LOCATION_ARGS(loc))) - -#define ckWARN3reg(loc, m, a1, a2) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, a2, \ - REPORT_LOCATION_ARGS(loc))) - -#define vWARN4(loc, m, a1, a2, a3) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, a2, a3, \ - REPORT_LOCATION_ARGS(loc))) - -#define ckWARN4reg(loc, m, a1, a2, a3) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, a2, a3, \ - REPORT_LOCATION_ARGS(loc))) - -#define vWARN5(loc, m, a1, a2, a3, a4) \ - _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, a2, a3, a4, \ - REPORT_LOCATION_ARGS(loc))) - -#define ckWARNexperimental(loc, class, m) \ - STMT_START { \ - if (! RExC_warned_ ## class) { /* warn once per compilation */ \ - RExC_warned_ ## class = 1; \ - _WARN_HELPER(loc, packWARN(class), \ - Perl_ck_warner_d(aTHX_ packWARN(class), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(loc)));\ - } \ - } STMT_END - -#define ckWARNexperimental_with_arg(loc, class, m, arg) \ - STMT_START { \ - if (! RExC_warned_ ## class) { /* warn once per compilation */ \ - RExC_warned_ ## class = 1; \ - _WARN_HELPER(loc, packWARN(class), \ - Perl_ck_warner_d(aTHX_ packWARN(class), \ - m REPORT_LOCATION, \ - arg, REPORT_LOCATION_ARGS(loc)));\ - } \ - } STMT_END - -/* Convert between a pointer to a node and its offset from the beginning of the - * program */ -#define REGNODE_p(offset) (RExC_emit_start + (offset)) -#define REGNODE_OFFSET(node) (__ASSERT_((node) >= RExC_emit_start) \ - (SSize_t) ((node) - RExC_emit_start)) - -#define ProgLen(ri) ri->proglen -#define SetProgLen(ri,x) ri->proglen = x - -#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS -#define EXPERIMENTAL_INPLACESCAN -#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ - -STATIC void -S_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len) -{ - PERL_ARGS_ASSERT_POPULATE_BITMAP_FROM_INVLIST; - - /* As the name says. The zeroth bit corresponds to the code point given by - * 'offset' */ - - UV start, end; - - Zero(bitmap, len, U8); - - invlist_iterinit(invlist); - while (invlist_iternext(invlist, &start, &end)) { - assert(start >= offset); - - for (UV i = start; i <= end; i++) { - UV adjusted = i - offset; - - BITMAP_BYTE(bitmap, adjusted) |= BITMAP_BIT(adjusted); - } - } - invlist_iterfinish(invlist); -} - -STATIC void -S_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_len, SV ** invlist, const UV offset) -{ - PERL_ARGS_ASSERT_POPULATE_INVLIST_FROM_BITMAP; - - /* As the name says. The zeroth bit corresponds to the code point given by - * 'offset' */ - - Size_t i; - - for (i = 0; i < bitmap_len; i++) { - if (BITMAP_TEST(bitmap, i)) { - int start = i++; - - /* Save a little work by adding a range all at once instead of bit - * by bit */ - while (i < bitmap_len && BITMAP_TEST(bitmap, i)) { - i++; - } - - *invlist = _add_range_to_invlist(*invlist, - start + offset, - i + offset - 1); - } - } -} - -#ifdef DEBUGGING -int -Perl_re_printf(pTHX_ const char *fmt, ...) -{ - va_list ap; - int result; - PerlIO *f= Perl_debug_log; - PERL_ARGS_ASSERT_RE_PRINTF; - va_start(ap, fmt); - result = PerlIO_vprintf(f, fmt, ap); - va_end(ap); - return result; -} - -int -Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...) -{ - va_list ap; - int result; - PerlIO *f= Perl_debug_log; - PERL_ARGS_ASSERT_RE_INDENTF; - va_start(ap, depth); - PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, ""); - result = PerlIO_vprintf(f, fmt, ap); - va_end(ap); - return result; -} -#endif /* DEBUGGING */ - -#define DEBUG_RExC_seen() \ - DEBUG_OPTIMISE_MORE_r({ \ - Perl_re_printf( aTHX_ "RExC_seen: "); \ - \ - if (RExC_seen & REG_ZERO_LEN_SEEN) \ - Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \ - \ - if (RExC_seen & REG_LOOKBEHIND_SEEN) \ - Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \ - \ - if (RExC_seen & REG_GPOS_SEEN) \ - Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \ - \ - if (RExC_seen & REG_RECURSE_SEEN) \ - Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \ - \ - if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ - Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \ - \ - if (RExC_seen & REG_VERBARG_SEEN) \ - Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \ - \ - if (RExC_seen & REG_CUTGROUP_SEEN) \ - Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \ - \ - if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ - Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \ - \ - if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ - Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \ - \ - if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ - Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \ - \ - Perl_re_printf( aTHX_ "\n"); \ - }); - -#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \ - if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag) - - -#ifdef DEBUGGING -static void -S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str, - const char *close_str) -{ - if (!flags) - return; - - Perl_re_printf( aTHX_ "%s", open_str); - DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL); - DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL); - DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF); - DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR); - DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR); - DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY); - DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE); - Perl_re_printf( aTHX_ "%s", close_str); -} - - -static void -S_debug_studydata(pTHX_ const char *where, scan_data_t *data, - U32 depth, int is_inf, - SSize_t min, SSize_t stopmin, SSize_t delta) -{ - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - DEBUG_OPTIMISE_MORE_r({ - if (!data) - return; - Perl_re_indentf(aTHX_ "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf, - depth, - where, - min, stopmin, delta, - (IV)data->pos_min, - (IV)data->pos_delta, - (UV)data->flags - ); - - S_debug_show_study_flags(aTHX_ data->flags," [","]"); - - Perl_re_printf( aTHX_ - " Whilem_c: %" IVdf " Lcp: %" IVdf " %s", - (IV)data->whilem_c, - (IV)(data->last_closep ? *((data)->last_closep) : -1), - is_inf ? "INF " : "" - ); - - if (data->last_found) { - int i; - Perl_re_printf(aTHX_ - "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf, - SvPVX_const(data->last_found), - (IV)data->last_end, - (IV)data->last_start_min, - (IV)data->last_start_max - ); - - for (i = 0; i < 2; i++) { - Perl_re_printf(aTHX_ - " %s%s: '%s' @ %" IVdf "/%" IVdf, - data->cur_is_floating == i ? "*" : "", - i ? "Float" : "Fixed", - SvPVX_const(data->substrs[i].str), - (IV)data->substrs[i].min_offset, - (IV)data->substrs[i].max_offset - ); - S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]"); - } - } - - Perl_re_printf( aTHX_ "\n"); - }); -} - - -static void -S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state, - regnode *scan, U32 depth, U32 flags) -{ - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - DEBUG_OPTIMISE_r({ - regnode *Next; - - if (!scan) - return; - Next = regnext(scan); - regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); - Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)", - depth, - str, - REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv), - Next ? (REG_NODE_NUM(Next)) : 0 ); - S_debug_show_study_flags(aTHX_ flags," [ ","]"); - Perl_re_printf( aTHX_ "\n"); - }); -} - - -# define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \ - S_debug_studydata(aTHX_ where, data, depth, is_inf, min, stopmin, delta) - -# define DEBUG_PEEP(str, scan, depth, flags) \ - S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags) - -#else -# define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP -# define DEBUG_PEEP(str, scan, depth, flags) NOOP -#endif - +#include "regcomp_internal.h" /* ========================================================= * BEGIN edit_distance stuff. @@ -1631,5313 +290,16 @@ S_edit_distance(const UV* src, /* END of edit_distance() stuff * ========================================================= */ -/* Mark that we cannot extend a found fixed substring at this point. - Update the longest found anchored substring or the longest found - floating substrings if needed. */ - -STATIC void -S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, - SSize_t *minlenp, int is_inf) -{ - const STRLEN l = CHR_SVLEN(data->last_found); - SV * const longest_sv = data->substrs[data->cur_is_floating].str; - const STRLEN old_l = CHR_SVLEN(longest_sv); - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_SCAN_COMMIT; - - if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { - const U8 i = data->cur_is_floating; - SvSetMagicSV(longest_sv, data->last_found); - data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min; - - if (!i) /* fixed */ - data->substrs[0].max_offset = data->substrs[0].min_offset; - else { /* float */ - data->substrs[1].max_offset = - (is_inf) - ? OPTIMIZE_INFTY - : (l - ? data->last_start_max - : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min - ? OPTIMIZE_INFTY - : data->pos_min + data->pos_delta)); - } - - data->substrs[i].flags &= ~SF_BEFORE_EOL; - data->substrs[i].flags |= data->flags & SF_BEFORE_EOL; - data->substrs[i].minlenp = minlenp; - data->substrs[i].lookbehind = 0; - } - - SvCUR_set(data->last_found, 0); - { - SV * const sv = data->last_found; - if (SvUTF8(sv) && SvMAGICAL(sv)) { - MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); - if (mg) - mg->mg_len = 0; - } - } - data->last_end = -1; - data->flags &= ~SF_BEFORE_EOL; - DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1); -} - -/* An SSC is just a regnode_charclass_posix with an extra field: the inversion - * list that describes which code points it matches */ - -STATIC void -S_ssc_anything(pTHX_ regnode_ssc *ssc) -{ - /* Set the SSC 'ssc' to match an empty string or any code point */ - - PERL_ARGS_ASSERT_SSC_ANYTHING; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - /* mortalize so won't leak */ - ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX)); - ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */ -} - -STATIC int -S_ssc_is_anything(const regnode_ssc *ssc) -{ - /* Returns TRUE if the SSC 'ssc' can match the empty string and any code - * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys - * us anything: if the function returns TRUE, 'ssc' hasn't been restricted - * in any way, so there's no point in using it */ - - UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */ - bool ret; - - PERL_ARGS_ASSERT_SSC_IS_ANYTHING; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) { - return FALSE; - } - - /* See if the list consists solely of the range 0 - Infinity */ - invlist_iterinit(ssc->invlist); - ret = invlist_iternext(ssc->invlist, &start, &end) - && start == 0 - && end == UV_MAX; - - invlist_iterfinish(ssc->invlist); - - if (ret) { - return TRUE; - } - - /* If e.g., both \w and \W are set, matches everything */ - if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { - int i; - for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { - if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { - return TRUE; - } - } - } - - return FALSE; -} - -STATIC void -S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) -{ - /* Initializes the SSC 'ssc'. This includes setting it to match an empty - * string, any code point, or any posix class under locale */ - - PERL_ARGS_ASSERT_SSC_INIT; - - Zero(ssc, 1, regnode_ssc); - set_ANYOF_SYNTHETIC(ssc); - ARG_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE); - ssc_anything(ssc); - - /* If any portion of the regex is to operate under locale rules that aren't - * fully known at compile time, initialization includes it. The reason - * this isn't done for all regexes is that the optimizer was written under - * the assumption that locale was all-or-nothing. Given the complexity and - * lack of documentation in the optimizer, and that there are inadequate - * test cases for locale, many parts of it may not work properly, it is - * safest to avoid locale unless necessary. */ - if (RExC_contains_locale) { - ANYOF_POSIXL_SETALL(ssc); - } - else { - ANYOF_POSIXL_ZERO(ssc); - } -} - -STATIC int -S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, - const regnode_ssc *ssc) -{ - /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only - * to the list of code points matched, and locale posix classes; hence does - * not check its flags) */ - - UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */ - bool ret; - - PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - invlist_iterinit(ssc->invlist); - ret = invlist_iternext(ssc->invlist, &start, &end) - && start == 0 - && end == UV_MAX; - - invlist_iterfinish(ssc->invlist); - - if (! ret) { - return FALSE; - } - - if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { - return FALSE; - } - - return TRUE; -} - -#define INVLIST_INDEX 0 -#define ONLY_LOCALE_MATCHES_INDEX 1 -#define DEFERRED_USER_DEFINED_INDEX 2 - -STATIC SV* -S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, - const regnode_charclass* const node) -{ - /* Returns a mortal inversion list defining which code points are matched - * by 'node', which is of ANYOF-ish type . Handles complementing the - * result if appropriate. If some code points aren't knowable at this - * time, the returned list must, and will, contain every code point that is - * a possibility. */ - - SV* invlist = NULL; - SV* only_utf8_locale_invlist = NULL; - bool new_node_has_latin1 = FALSE; - const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF) - ? ANYOF_FLAGS(node) - : 0; - - PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; - - /* Look at the data structure created by S_set_ANYOF_arg() */ - if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) { - invlist = sv_2mortal(_new_invlist(1)); - invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX); - } - else if (ANYOF_HAS_AUX(node)) { - const U32 n = ARG(node); - SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); - AV * const av = MUTABLE_AV(SvRV(rv)); - SV **const ary = AvARRAY(av); - - if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) { - - /* Here there are things that won't be known until runtime -- we - * have to assume it could be anything */ - invlist = sv_2mortal(_new_invlist(1)); - return _add_range_to_invlist(invlist, 0, UV_MAX); - } - else if (ary[INVLIST_INDEX]) { - - /* Use the node's inversion list */ - invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL)); - } - - /* Get the code points valid only under UTF-8 locales */ - if ( (flags & ANYOFL_FOLD) - && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) - { - only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX]; - } - } - - if (! invlist) { - invlist = sv_2mortal(_new_invlist(0)); - } - - /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS - * code points, and an inversion list for the others, but if there are code - * points that should match only conditionally on the target string being - * UTF-8, those are placed in the inversion list, and not the bitmap. - * Since there are circumstances under which they could match, they are - * included in the SSC. But if the ANYOF node is to be inverted, we have - * to exclude them here, so that when we invert below, the end result - * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We - * have to do this here before we add the unconditionally matched code - * points */ - if (flags & ANYOF_INVERT) { - _invlist_intersection_complement_2nd(invlist, - PL_UpperLatin1, - &invlist); - } - - /* Add in the points from the bit map */ - if (REGNODE_TYPE(OP(node)) == ANYOF){ - for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { - if (ANYOF_BITMAP_TEST(node, i)) { - unsigned int start = i++; - - for (; i < NUM_ANYOF_CODE_POINTS - && ANYOF_BITMAP_TEST(node, i); ++i) - { - /* empty */ - } - invlist = _add_range_to_invlist(invlist, start, i-1); - new_node_has_latin1 = TRUE; - } - } - } - - /* If this can match all upper Latin1 code points, have to add them - * as well. But don't add them if inverting, as when that gets done below, - * it would exclude all these characters, including the ones it shouldn't - * that were added just above */ - if ( ! (flags & ANYOF_INVERT) - && OP(node) == ANYOFD - && (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)) - { - _invlist_union(invlist, PL_UpperLatin1, &invlist); - } - - /* Similarly for these */ - if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) { - _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist); - } - - if (flags & ANYOF_INVERT) { - _invlist_invert(invlist); - } - else if (flags & ANYOFL_FOLD) { - if (new_node_has_latin1) { - - /* These folds are potential in Turkic locales */ - if (_invlist_contains_cp(invlist, 'i')) { - invlist = add_cp_to_invlist(invlist, - LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); - } - if (_invlist_contains_cp(invlist, 'I')) { - invlist = add_cp_to_invlist(invlist, - LATIN_SMALL_LETTER_DOTLESS_I); - } - - /* Under /li, any 0-255 could fold to any other 0-255, depending on - * the locale. We can skip this if there are no 0-255 at all. */ - _invlist_union(invlist, PL_Latin1, &invlist); - } - else { - if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) { - invlist = add_cp_to_invlist(invlist, 'I'); - } - if (_invlist_contains_cp(invlist, - LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)) - { - invlist = add_cp_to_invlist(invlist, 'i'); - } - } - } - - /* Similarly add the UTF-8 locale possible matches. These have to be - * deferred until after the non-UTF-8 locale ones are taken care of just - * above, or it leads to wrong results under ANYOF_INVERT */ - if (only_utf8_locale_invlist) { - _invlist_union_maybe_complement_2nd(invlist, - only_utf8_locale_invlist, - flags & ANYOF_INVERT, - &invlist); - } - - return invlist; -} - -/* These two functions currently do the exact same thing */ -#define ssc_init_zero ssc_init - -#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) -#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) - -/* 'AND' a given class with another one. Can create false positives. 'ssc' - * should not be inverted. */ - -STATIC void -S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, - const regnode_charclass *and_with) -{ - /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either - * another SSC or a regular ANYOF class. Can create false positives. */ - - SV* anded_cp_list; - U8 and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF) - ? ANYOF_FLAGS(and_with) - : 0; - U8 anded_flags; - - PERL_ARGS_ASSERT_SSC_AND; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract - * the code point inversion list and just the relevant flags */ - if (is_ANYOF_SYNTHETIC(and_with)) { - anded_cp_list = ((regnode_ssc *)and_with)->invlist; - anded_flags = and_with_flags; - - /* XXX This is a kludge around what appears to be deficiencies in the - * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, - * there are paths through the optimizer where it doesn't get weeded - * out when it should. And if we don't make some extra provision for - * it like the code just below, it doesn't get added when it should. - * This solution is to add it only when AND'ing, which is here, and - * only when what is being AND'ed is the pristine, original node - * matching anything. Thus it is like adding it to ssc_anything() but - * only when the result is to be AND'ed. Probably the same solution - * could be adopted for the same problem we have with /l matching, - * which is solved differently in S_ssc_init(), and that would lead to - * fewer false positives than that solution has. But if this solution - * creates bugs, the consequences are only that a warning isn't raised - * that should be; while the consequences for having /l bugs is - * incorrect matches */ - if (ssc_is_anything((regnode_ssc *)and_with)) { - anded_flags |= ANYOF_WARN_SUPER__shared; - } - } - else { - anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); - if (OP(and_with) == ANYOFD) { - anded_flags = and_with_flags & ANYOF_COMMON_FLAGS; - } - else { - anded_flags = and_with_flags - & ( ANYOF_COMMON_FLAGS - |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared - |ANYOF_HAS_EXTRA_RUNTIME_MATCHES); - if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) { - anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES; - } - } - } - - ANYOF_FLAGS(ssc) &= anded_flags; - - /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. - * C2 is the list of code points in 'and-with'; P2, its posix classes. - * 'and_with' may be inverted. When not inverted, we have the situation of - * computing: - * (C1 | P1) & (C2 | P2) - * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) - * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) - * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) - * <= ((C1 & C2) | P1 | P2) - * Alternatively, the last few steps could be: - * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) - * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) - * <= (C1 | C2 | (P1 & P2)) - * We favor the second approach if either P1 or P2 is non-empty. This is - * because these components are a barrier to doing optimizations, as what - * they match cannot be known until the moment of matching as they are - * dependent on the current locale, 'AND"ing them likely will reduce or - * eliminate them. - * But we can do better if we know that C1,P1 are in their initial state (a - * frequent occurrence), each matching everything: - * (<everything>) & (C2 | P2) = C2 | P2 - * Similarly, if C2,P2 are in their initial state (again a frequent - * occurrence), the result is a no-op - * (C1 | P1) & (<everything>) = C1 | P1 - * - * Inverted, we have - * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) - * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) - * <= (C1 & ~C2) | (P1 & ~P2) - * */ - - if ((and_with_flags & ANYOF_INVERT) - && ! is_ANYOF_SYNTHETIC(and_with)) - { - unsigned int i; - - ssc_intersection(ssc, - anded_cp_list, - FALSE /* Has already been inverted */ - ); - - /* If either P1 or P2 is empty, the intersection will be also; can skip - * the loop */ - if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) { - ANYOF_POSIXL_ZERO(ssc); - } - else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { - - /* Note that the Posix class component P from 'and_with' actually - * looks like: - * P = Pa | Pb | ... | Pn - * where each component is one posix class, such as in [\w\s]. - * Thus - * ~P = ~(Pa | Pb | ... | Pn) - * = ~Pa & ~Pb & ... & ~Pn - * <= ~Pa | ~Pb | ... | ~Pn - * The last is something we can easily calculate, but unfortunately - * is likely to have many false positives. We could do better - * in some (but certainly not all) instances if two classes in - * P have known relationships. For example - * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: - * So - * :lower: & :print: = :lower: - * And similarly for classes that must be disjoint. For example, - * since \s and \w can have no elements in common based on rules in - * the POSIX standard, - * \w & ^\S = nothing - * Unfortunately, some vendor locales do not meet the Posix - * standard, in particular almost everything by Microsoft. - * The loop below just changes e.g., \w into \W and vice versa */ - - regnode_charclass_posixl temp; - int add = 1; /* To calculate the index of the complement */ - - Zero(&temp, 1, regnode_charclass_posixl); - ANYOF_POSIXL_ZERO(&temp); - for (i = 0; i < ANYOF_MAX; i++) { - assert(i % 2 != 0 - || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) - || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); - - if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { - ANYOF_POSIXL_SET(&temp, i + add); - } - add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ - } - ANYOF_POSIXL_AND(&temp, ssc); - - } /* else ssc already has no posixes */ - } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC - in its initial state */ - else if (! is_ANYOF_SYNTHETIC(and_with) - || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) - { - /* But if 'ssc' is in its initial state, the result is just 'and_with'; - * copy it over 'ssc' */ - if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { - if (is_ANYOF_SYNTHETIC(and_with)) { - StructCopy(and_with, ssc, regnode_ssc); - } - else { - ssc->invlist = anded_cp_list; - ANYOF_POSIXL_ZERO(ssc); - if (and_with_flags & ANYOF_MATCHES_POSIXL) { - ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); - } - } - } - else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) - || (and_with_flags & ANYOF_MATCHES_POSIXL)) - { - /* One or the other of P1, P2 is non-empty. */ - if (and_with_flags & ANYOF_MATCHES_POSIXL) { - ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); - } - ssc_union(ssc, anded_cp_list, FALSE); - } - else { /* P1 = P2 = empty */ - ssc_intersection(ssc, anded_cp_list, FALSE); - } - } -} - -STATIC void -S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, - const regnode_charclass *or_with) -{ - /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either - * another SSC or a regular ANYOF class. Can create false positives if - * 'or_with' is to be inverted. */ - - SV* ored_cp_list; - U8 ored_flags; - U8 or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF) - ? ANYOF_FLAGS(or_with) - : 0; - - PERL_ARGS_ASSERT_SSC_OR; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract - * the code point inversion list and just the relevant flags */ - if (is_ANYOF_SYNTHETIC(or_with)) { - ored_cp_list = ((regnode_ssc*) or_with)->invlist; - ored_flags = or_with_flags; - } - else { - ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); - ored_flags = or_with_flags & ANYOF_COMMON_FLAGS; - if (OP(or_with) != ANYOFD) { - ored_flags |= - or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared - |ANYOF_HAS_EXTRA_RUNTIME_MATCHES); - if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) { - ored_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES; - } - } - } - - ANYOF_FLAGS(ssc) |= ored_flags; - - /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. - * C2 is the list of code points in 'or-with'; P2, its posix classes. - * 'or_with' may be inverted. When not inverted, we have the simple - * situation of computing: - * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) - * If P1|P2 yields a situation with both a class and its complement are - * set, like having both \w and \W, this matches all code points, and we - * can delete these from the P component of the ssc going forward. XXX We - * might be able to delete all the P components, but I (khw) am not certain - * about this, and it is better to be safe. - * - * Inverted, we have - * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) - * <= (C1 | P1) | ~C2 - * <= (C1 | ~C2) | P1 - * (which results in actually simpler code than the non-inverted case) - * */ - - if ((or_with_flags & ANYOF_INVERT) - && ! is_ANYOF_SYNTHETIC(or_with)) - { - /* We ignore P2, leaving P1 going forward */ - } /* else Not inverted */ - else if (or_with_flags & ANYOF_MATCHES_POSIXL) { - ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); - if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { - unsigned int i; - for (i = 0; i < ANYOF_MAX; i += 2) { - if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) - { - ssc_match_all_cp(ssc); - ANYOF_POSIXL_CLEAR(ssc, i); - ANYOF_POSIXL_CLEAR(ssc, i+1); - } - } - } - } - - ssc_union(ssc, - ored_cp_list, - FALSE /* Already has been inverted */ - ); -} - -STATIC void -S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) -{ - PERL_ARGS_ASSERT_SSC_UNION; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - _invlist_union_maybe_complement_2nd(ssc->invlist, - invlist, - invert2nd, - &ssc->invlist); -} - -STATIC void -S_ssc_intersection(pTHX_ regnode_ssc *ssc, - SV* const invlist, - const bool invert2nd) -{ - PERL_ARGS_ASSERT_SSC_INTERSECTION; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - _invlist_intersection_maybe_complement_2nd(ssc->invlist, - invlist, - invert2nd, - &ssc->invlist); -} - -STATIC void -S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) -{ - PERL_ARGS_ASSERT_SSC_ADD_RANGE; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); -} - -STATIC void -S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) -{ - /* AND just the single code point 'cp' into the SSC 'ssc' */ - - SV* cp_list = _new_invlist(2); - - PERL_ARGS_ASSERT_SSC_CP_AND; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - cp_list = add_cp_to_invlist(cp_list, cp); - ssc_intersection(ssc, cp_list, - FALSE /* Not inverted */ - ); - SvREFCNT_dec_NN(cp_list); -} - -STATIC void -S_ssc_clear_locale(regnode_ssc *ssc) -{ - /* Set the SSC 'ssc' to not match any locale things */ - PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - ANYOF_POSIXL_ZERO(ssc); - ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; -} - -STATIC bool -S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) -{ - /* The synthetic start class is used to hopefully quickly winnow down - * places where a pattern could start a match in the target string. If it - * doesn't really narrow things down that much, there isn't much point to - * having the overhead of using it. This function uses some very crude - * heuristics to decide if to use the ssc or not. - * - * It returns TRUE if 'ssc' rules out more than half what it considers to - * be the "likely" possible matches, but of course it doesn't know what the - * actual things being matched are going to be; these are only guesses - * - * For /l matches, it assumes that the only likely matches are going to be - * in the 0-255 range, uniformly distributed, so half of that is 127 - * For /a and /d matches, it assumes that the likely matches will be just - * the ASCII range, so half of that is 63 - * For /u and there isn't anything matching above the Latin1 range, it - * assumes that that is the only range likely to be matched, and uses - * half that as the cut-off: 127. If anything matches above Latin1, - * it assumes that all of Unicode could match (uniformly), except for - * non-Unicode code points and things in the General Category "Other" - * (unassigned, private use, surrogates, controls and formats). This - * is a much large number. */ - - U32 count = 0; /* Running total of number of code points matched by - 'ssc' */ - UV start, end; /* Start and end points of current range in inversion - XXX outdated. UTF-8 locales are common, what about invert? list */ - const U32 max_code_points = (LOC) - ? 256 - : (( ! UNI_SEMANTICS - || invlist_highest(ssc->invlist) < 256) - ? 128 - : NON_OTHER_COUNT); - const U32 max_match = max_code_points / 2; - - PERL_ARGS_ASSERT_IS_SSC_WORTH_IT; - - invlist_iterinit(ssc->invlist); - while (invlist_iternext(ssc->invlist, &start, &end)) { - if (start >= max_code_points) { - break; - } - end = MIN(end, max_code_points - 1); - count += end - start + 1; - if (count >= max_match) { - invlist_iterfinish(ssc->invlist); - return FALSE; - } - } - - return TRUE; -} - - -STATIC void -S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) -{ - /* The inversion list in the SSC is marked mortal; now we need a more - * permanent copy, which is stored the same way that is done in a regular - * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit - * map */ - - SV* invlist = invlist_clone(ssc->invlist, NULL); - - PERL_ARGS_ASSERT_SSC_FINALIZE; - - assert(is_ANYOF_SYNTHETIC(ssc)); - - /* The code in this file assumes that all but these flags aren't relevant - * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared - * by the time we reach here */ - assert(! (ANYOF_FLAGS(ssc) - & ~( ANYOF_COMMON_FLAGS - |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared - |ANYOF_HAS_EXTRA_RUNTIME_MATCHES))); - - populate_anyof_bitmap_from_invlist( (regnode *) ssc, &invlist); - - set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL); - SvREFCNT_dec(invlist); - - /* Make sure is clone-safe */ - ssc->invlist = NULL; - - if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { - ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL; - OP(ssc) = ANYOFPOSIXL; - } - else if (RExC_contains_locale) { - OP(ssc) = ANYOFL; - } - - assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); -} - -#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] -#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) -#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) -#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ - ? (TRIE_LIST_CUR( idx ) - 1) \ - : 0 ) - - -#ifdef DEBUGGING -/* - dump_trie(trie,widecharmap,revcharmap) - dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) - dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) - - These routines dump out a trie in a somewhat readable format. - The _interim_ variants are used for debugging the interim - tables that are used to generate the final compressed - representation which is what dump_trie expects. - - Part of the reason for their existence is to provide a form - of documentation as to how the different representations function. - -*/ - -/* - Dumps the final compressed table form of the trie to Perl_debug_log. - Used for debugging make_trie(). -*/ - -STATIC void -S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, - AV *revcharmap, U32 depth) -{ - U32 state; - SV *sv=sv_newmortal(); - int colwidth= widecharmap ? 6 : 4; - U16 word; - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_DUMP_TRIE; - - Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ", - depth+1, "Match","Base","Ofs" ); - - for( state = 0 ; state < trie->uniquecharcount ; state++ ) { - SV ** const tmp = av_fetch_simple( revcharmap, state, 0); - if ( tmp ) { - Perl_re_printf( aTHX_ "%*s", - colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, - PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR - ) - ); - } - } - Perl_re_printf( aTHX_ "\n"); - Perl_re_indentf( aTHX_ "State|-----------------------", depth+1); - - for( state = 0 ; state < trie->uniquecharcount ; state++ ) - Perl_re_printf( aTHX_ "%.*s", colwidth, "--------"); - Perl_re_printf( aTHX_ "\n"); - - for( state = 1 ; state < trie->statecount ; state++ ) { - const U32 base = trie->states[ state ].trans.base; - - Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state); - - if ( trie->states[ state ].wordnum ) { - Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum ); - } else { - Perl_re_printf( aTHX_ "%6s", "" ); - } - - Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base ); - - if ( base ) { - U32 ofs = 0; - - while( ( base + ofs < trie->uniquecharcount ) || - ( base + ofs - trie->uniquecharcount < trie->lasttrans - && trie->trans[ base + ofs - trie->uniquecharcount ].check - != state)) - ofs++; - - Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs); - - for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { - if ( ( base + ofs >= trie->uniquecharcount ) - && ( base + ofs - trie->uniquecharcount - < trie->lasttrans ) - && trie->trans[ base + ofs - - trie->uniquecharcount ].check == state ) - { - Perl_re_printf( aTHX_ "%*" UVXf, colwidth, - (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next - ); - } else { - Perl_re_printf( aTHX_ "%*s", colwidth," ." ); - } - } - - Perl_re_printf( aTHX_ "]"); - - } - Perl_re_printf( aTHX_ "\n" ); - } - Perl_re_indentf( aTHX_ "word_info N:(prev,len)=", - depth); - for (word=1; word <= trie->wordcount; word++) { - Perl_re_printf( aTHX_ " %d:(%d,%d)", - (int)word, (int)(trie->wordinfo[word].prev), - (int)(trie->wordinfo[word].len)); - } - Perl_re_printf( aTHX_ "\n" ); -} -/* - Dumps a fully constructed but uncompressed trie in list form. - List tries normally only are used for construction when the number of - possible chars (trie->uniquecharcount) is very high. - Used for debugging make_trie(). -*/ -STATIC void -S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, - HV *widecharmap, AV *revcharmap, U32 next_alloc, - U32 depth) -{ - U32 state; - SV *sv=sv_newmortal(); - int colwidth= widecharmap ? 6 : 4; - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; - - /* print out the table precompression. */ - Perl_re_indentf( aTHX_ "State :Word | Transition Data\n", - depth+1 ); - Perl_re_indentf( aTHX_ "%s", - depth+1, "------:-----+-----------------\n" ); - - for( state=1 ; state < next_alloc ; state ++ ) { - U16 charid; - - Perl_re_indentf( aTHX_ " %4" UVXf " :", - depth+1, (UV)state ); - if ( ! trie->states[ state ].wordnum ) { - Perl_re_printf( aTHX_ "%5s| ",""); - } else { - Perl_re_printf( aTHX_ "W%4x| ", - trie->states[ state ].wordnum - ); - } - for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { - SV ** const tmp = av_fetch_simple( revcharmap, - TRIE_LIST_ITEM(state, charid).forid, 0); - if ( tmp ) { - Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ", - colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), - colwidth, - PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) - | PERL_PV_ESCAPE_FIRSTCHAR - ) , - TRIE_LIST_ITEM(state, charid).forid, - (UV)TRIE_LIST_ITEM(state, charid).newstate - ); - if (!(charid % 10)) - Perl_re_printf( aTHX_ "\n%*s| ", - (int)((depth * 2) + 14), ""); - } - } - Perl_re_printf( aTHX_ "\n"); - } -} - -/* - Dumps a fully constructed but uncompressed trie in table form. - This is the normal DFA style state transition table, with a few - twists to facilitate compression later. - Used for debugging make_trie(). -*/ -STATIC void -S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, - HV *widecharmap, AV *revcharmap, U32 next_alloc, - U32 depth) -{ - U32 state; - U16 charid; - SV *sv=sv_newmortal(); - int colwidth= widecharmap ? 6 : 4; - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; - - /* - print out the table precompression so that we can do a visual check - that they are identical. - */ - - Perl_re_indentf( aTHX_ "Char : ", depth+1 ); - - for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { - SV ** const tmp = av_fetch_simple( revcharmap, charid, 0); - if ( tmp ) { - Perl_re_printf( aTHX_ "%*s", - colwidth, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, - PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR - ) - ); - } - } - - Perl_re_printf( aTHX_ "\n"); - Perl_re_indentf( aTHX_ "State+-", depth+1 ); - - for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { - Perl_re_printf( aTHX_ "%.*s", colwidth,"--------"); - } - - Perl_re_printf( aTHX_ "\n" ); - - for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { - - Perl_re_indentf( aTHX_ "%4" UVXf " : ", - depth+1, - (UV)TRIE_NODENUM( state ) ); - - for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { - UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); - if (v) - Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v ); - else - Perl_re_printf( aTHX_ "%*s", colwidth, "." ); - } - if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { - Perl_re_printf( aTHX_ " (%4" UVXf ")\n", - (UV)trie->trans[ state ].check ); - } else { - Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n", - (UV)trie->trans[ state ].check, - trie->states[ TRIE_NODENUM( state ) ].wordnum ); - } - } -} - -#endif - - -/* make_trie(startbranch,first,last,tail,word_count,flags,depth) - startbranch: the first branch in the whole branch sequence - first : start branch of sequence of branch-exact nodes. - May be the same as startbranch - last : Thing following the last branch. - May be the same as tail. - tail : item following the branch sequence - count : words in the sequence - flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/ - depth : indent depth - -Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. - -A trie is an N'ary tree where the branches are determined by digital -decomposition of the key. IE, at the root node you look up the 1st character and -follow that branch repeat until you find the end of the branches. Nodes can be -marked as "accepting" meaning they represent a complete word. Eg: - - /he|she|his|hers/ - -would convert into the following structure. Numbers represent states, letters -following numbers represent valid transitions on the letter from that state, if -the number is in square brackets it represents an accepting state, otherwise it -will be in parenthesis. - - +-h->+-e->[3]-+-r->(8)-+-s->[9] - | | - | (2) - | | - (1) +-i->(6)-+-s->[7] - | - +-s->(3)-+-h->(4)-+-e->[5] - - Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) - -This shows that when matching against the string 'hers' we will begin at state 1 -read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, -then read 'r' and go to state 8 followed by 's' which takes us to state 9 which -is also accepting. Thus we know that we can match both 'he' and 'hers' with a -single traverse. We store a mapping from accepting to state to which word was -matched, and then when we have multiple possibilities we try to complete the -rest of the regex in the order in which they occurred in the alternation. - -The only prior NFA like behaviour that would be changed by the TRIE support is -the silent ignoring of duplicate alternations which are of the form: - - / (DUPE|DUPE) X? (?{ ... }) Y /x - -Thus EVAL blocks following a trie may be called a different number of times with -and without the optimisation. With the optimisations dupes will be silently -ignored. This inconsistent behaviour of EVAL type nodes is well established as -the following demonstrates: - - 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ - -which prints out 'word' three times, but - - 'words'=~/(word|word|word)(?{ print $1 })S/ - -which doesnt print it out at all. This is due to other optimisations kicking in. - -Example of what happens on a structural level: - -The regexp /(ac|ad|ab)+/ will produce the following debug output: - - 1: CURLYM[1] {1,32767}(18) - 5: BRANCH(8) - 6: EXACT <ac>(16) - 8: BRANCH(11) - 9: EXACT <ad>(16) - 11: BRANCH(14) - 12: EXACT <ab>(16) - 16: SUCCEED(0) - 17: NOTHING(18) - 18: END(0) - -This would be optimizable with startbranch=5, first=5, last=16, tail=16 -and should turn into: - - 1: CURLYM[1] {1,32767}(18) - 5: TRIE(16) - [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] - <ac> - <ad> - <ab> - 16: SUCCEED(0) - 17: NOTHING(18) - 18: END(0) - -Cases where tail != last would be like /(?foo|bar)baz/: - - 1: BRANCH(4) - 2: EXACT <foo>(8) - 4: BRANCH(7) - 5: EXACT <bar>(8) - 7: TAIL(8) - 8: EXACT <baz>(10) - 10: END(0) - -which would be optimizable with startbranch=1, first=1, last=7, tail=8 -and would end up looking like: - - 1: TRIE(8) - [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] - <foo> - <bar> - 7: TAIL(8) - 8: EXACT <baz>(10) - 10: END(0) - - d = uvchr_to_utf8_flags(d, uv, 0); - -is the recommended Unicode-aware way of saying - - *(d++) = uv; -*/ - -#define TRIE_STORE_REVCHAR(val) \ - STMT_START { \ - if (UTF) { \ - SV *zlopp = newSV(UTF8_MAXBYTES); \ - unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ - unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ - *kapow = '\0'; \ - SvCUR_set(zlopp, kapow - flrbbbbb); \ - SvPOK_on(zlopp); \ - SvUTF8_on(zlopp); \ - av_push_simple(revcharmap, zlopp); \ - } else { \ - char ooooff = (char)val; \ - av_push_simple(revcharmap, newSVpvn(&ooooff, 1)); \ - } \ - } STMT_END - -/* This gets the next character from the input, folding it if not already - * folded. */ -#define TRIE_READ_CHAR STMT_START { \ - wordlen++; \ - if ( UTF ) { \ - /* if it is UTF then it is either already folded, or does not need \ - * folding */ \ - uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ - } \ - else if (folder == PL_fold_latin1) { \ - /* This folder implies Unicode rules, which in the range expressible \ - * by not UTF is the lower case, with the two exceptions, one of \ - * which should have been taken care of before calling this */ \ - assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ - uvc = toLOWER_L1(*uc); \ - if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ - len = 1; \ - } else { \ - /* raw data, will be folded later if needed */ \ - uvc = (U32)*uc; \ - len = 1; \ - } \ -} STMT_END - - - -#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ - if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ - U32 ging = TRIE_LIST_LEN( state ) * 2; \ - Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ - TRIE_LIST_LEN( state ) = ging; \ - } \ - TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ - TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ - TRIE_LIST_CUR( state )++; \ -} STMT_END - -#define TRIE_LIST_NEW(state) STMT_START { \ - Newx( trie->states[ state ].trans.list, \ - 4, reg_trie_trans_le ); \ - TRIE_LIST_CUR( state ) = 1; \ - TRIE_LIST_LEN( state ) = 4; \ -} STMT_END - -#define TRIE_HANDLE_WORD(state) STMT_START { \ - U16 dupe= trie->states[ state ].wordnum; \ - regnode * const noper_next = regnext( noper ); \ - \ - DEBUG_r({ \ - /* store the word for dumping */ \ - SV* tmp; \ - if (OP(noper) != NOTHING) \ - tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ - else \ - tmp = newSVpvn_utf8( "", 0, UTF ); \ - av_push_simple( trie_words, tmp ); \ - }); \ - \ - curword++; \ - trie->wordinfo[curword].prev = 0; \ - trie->wordinfo[curword].len = wordlen; \ - trie->wordinfo[curword].accept = state; \ - \ - if ( noper_next < tail ) { \ - if (!trie->jump) \ - trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ - sizeof(U16) ); \ - trie->jump[curword] = (U16)(noper_next - convert); \ - if (!jumper) \ - jumper = noper_next; \ - if (!nextbranch) \ - nextbranch= regnext(cur); \ - } \ - \ - if ( dupe ) { \ - /* It's a dupe. Pre-insert into the wordinfo[].prev */\ - /* chain, so that when the bits of chain are later */\ - /* linked together, the dups appear in the chain */\ - trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \ - trie->wordinfo[dupe].prev = curword; \ - } else { \ - /* we haven't inserted this word yet. */ \ - trie->states[ state ].wordnum = curword; \ - } \ -} STMT_END - - -#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ - ( ( base + charid >= ucharcount \ - && base + charid < ubound \ - && state == trie->trans[ base - ucharcount + charid ].check \ - && trie->trans[ base - ucharcount + charid ].next ) \ - ? trie->trans[ base - ucharcount + charid ].next \ - : ( state==1 ? special : 0 ) \ - ) - -#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \ -STMT_START { \ - TRIE_BITMAP_SET(trie, uvc); \ - /* store the folded codepoint */ \ - if ( folder ) \ - TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \ - \ - if ( !UTF ) { \ - /* store first byte of utf8 representation of */ \ - /* variant codepoints */ \ - if (! UVCHR_IS_INVARIANT(uvc)) { \ - TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \ - } \ - } \ -} STMT_END -#define MADE_TRIE 1 -#define MADE_JUMP_TRIE 2 -#define MADE_EXACT_TRIE 4 - -STATIC I32 -S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, - regnode *first, regnode *last, regnode *tail, - U32 word_count, U32 flags, U32 depth) -{ - /* first pass, loop through and scan words */ - reg_trie_data *trie; - HV *widecharmap = NULL; - AV *revcharmap = newAV(); - regnode *cur; - STRLEN len = 0; - UV uvc = 0; - U16 curword = 0; - U32 next_alloc = 0; - regnode *jumper = NULL; - regnode *nextbranch = NULL; - regnode *convert = NULL; - U32 *prev_states; /* temp array mapping each state to previous one */ - /* we just use folder as a flag in utf8 */ - const U8 * folder = NULL; - - /* in the below add_data call we are storing either 'tu' or 'tuaa' - * which stands for one trie structure, one hash, optionally followed - * by two arrays */ -#ifdef DEBUGGING - const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa")); - AV *trie_words = NULL; - /* along with revcharmap, this only used during construction but both are - * useful during debugging so we store them in the struct when debugging. - */ -#else - const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu")); - STRLEN trie_charcount=0; -#endif - SV *re_trie_maxbuff; - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_MAKE_TRIE; -#ifndef DEBUGGING - PERL_UNUSED_ARG(depth); -#endif - - switch (flags) { - case EXACT: case EXACT_REQ8: case EXACTL: break; - case EXACTFAA: - case EXACTFUP: - case EXACTFU: - case EXACTFLU8: folder = PL_fold_latin1; break; - case EXACTF: folder = PL_fold; break; - default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, REGNODE_NAME(flags) ); - } - - trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); - trie->refcount = 1; - trie->startstate = 1; - trie->wordcount = word_count; - RExC_rxi->data->data[ data_slot ] = (void*)trie; - trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); - if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL) - trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); - trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( - trie->wordcount+1, sizeof(reg_trie_wordinfo)); - - DEBUG_r({ - trie_words = newAV(); - }); - - re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD); - assert(re_trie_maxbuff); - if (!SvIOK(re_trie_maxbuff)) { - sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); - } - DEBUG_TRIE_COMPILE_r({ - Perl_re_indentf( aTHX_ - "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", - depth+1, - REG_NODE_NUM(startbranch), REG_NODE_NUM(first), - REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); - }); - - /* Find the node we are going to overwrite */ - if ( first == startbranch && OP( last ) != BRANCH ) { - /* whole branch chain */ - convert = first; - } else { - /* branch sub-chain */ - convert = REGNODE_AFTER( first ); - } - - /* -- First loop and Setup -- - - We first traverse the branches and scan each word to determine if it - contains widechars, and how many unique chars there are, this is - important as we have to build a table with at least as many columns as we - have unique chars. - - We use an array of integers to represent the character codes 0..255 - (trie->charmap) and we use a an HV* to store Unicode characters. We use - the native representation of the character value as the key and IV's for - the coded index. - - *TODO* If we keep track of how many times each character is used we can - remap the columns so that the table compression later on is more - efficient in terms of memory by ensuring the most common value is in the - middle and the least common are on the outside. IMO this would be better - than a most to least common mapping as theres a decent chance the most - common letter will share a node with the least common, meaning the node - will not be compressible. With a middle is most common approach the worst - case is when we have the least common nodes twice. - - */ - - for ( cur = first ; cur < last ; cur = regnext( cur ) ) { - regnode *noper = REGNODE_AFTER( cur ); - const U8 *uc; - const U8 *e; - int foldlen = 0; - U32 wordlen = 0; /* required init */ - STRLEN minchars = 0; - STRLEN maxchars = 0; - bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the - bitmap?*/ - - if (OP(noper) == NOTHING) { - /* skip past a NOTHING at the start of an alternation - * eg, /(?:)a|(?:b)/ should be the same as /a|b/ - * - * If the next node is not something we are supposed to process - * we will just ignore it due to the condition guarding the - * next block. - */ - - regnode *noper_next= regnext(noper); - if (noper_next < tail) - noper= noper_next; - } - - if ( noper < tail - && ( OP(noper) == flags - || (flags == EXACT && OP(noper) == EXACT_REQ8) - || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 - || OP(noper) == EXACTFUP)))) - { - uc= (U8*)STRING(noper); - e= uc + STR_LEN(noper); - } else { - trie->minlen= 0; - continue; - } - - - if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ - TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte - regardless of encoding */ - if (OP( noper ) == EXACTFUP) { - /* false positives are ok, so just set this */ - TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); - } - } - - for ( ; uc < e ; uc += len ) { /* Look at each char in the current - branch */ - TRIE_CHARCOUNT(trie)++; - TRIE_READ_CHAR; - - /* TRIE_READ_CHAR returns the current character, or its fold if /i - * is in effect. Under /i, this character can match itself, or - * anything that folds to it. If not under /i, it can match just - * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN - * all fold to k, and all are single characters. But some folds - * expand to more than one character, so for example LATIN SMALL - * LIGATURE FFI folds to the three character sequence 'ffi'. If - * the string beginning at 'uc' is 'ffi', it could be matched by - * three characters, or just by the one ligature character. (It - * could also be matched by two characters: LATIN SMALL LIGATURE FF - * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). - * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also - * match.) The trie needs to know the minimum and maximum number - * of characters that could match so that it can use size alone to - * quickly reject many match attempts. The max is simple: it is - * the number of folded characters in this branch (since a fold is - * never shorter than what folds to it. */ - - maxchars++; - - /* And the min is equal to the max if not under /i (indicated by - * 'folder' being NULL), or there are no multi-character folds. If - * there is a multi-character fold, the min is incremented just - * once, for the character that folds to the sequence. Each - * character in the sequence needs to be added to the list below of - * characters in the trie, but we count only the first towards the - * min number of characters needed. This is done through the - * variable 'foldlen', which is returned by the macros that look - * for these sequences as the number of bytes the sequence - * occupies. Each time through the loop, we decrement 'foldlen' by - * how many bytes the current char occupies. Only when it reaches - * 0 do we increment 'minchars' or look for another multi-character - * sequence. */ - if (folder == NULL) { - minchars++; - } - else if (foldlen > 0) { - foldlen -= (UTF) ? UTF8SKIP(uc) : 1; - } - else { - minchars++; - - /* See if *uc is the beginning of a multi-character fold. If - * so, we decrement the length remaining to look at, to account - * for the current character this iteration. (We can use 'uc' - * instead of the fold returned by TRIE_READ_CHAR because the - * macro is smart enough to account for any unfolded - * characters. */ - if (UTF) { - if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { - foldlen -= UTF8SKIP(uc); - } - } - else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { - foldlen--; - } - } - - /* The current character (and any potential folds) should be added - * to the possible matching characters for this position in this - * branch */ - if ( uvc < 256 ) { - if ( folder ) { - U8 folded= folder[ (U8) uvc ]; - if ( !trie->charmap[ folded ] ) { - trie->charmap[ folded ]=( ++trie->uniquecharcount ); - TRIE_STORE_REVCHAR( folded ); - } - } - if ( !trie->charmap[ uvc ] ) { - trie->charmap[ uvc ]=( ++trie->uniquecharcount ); - TRIE_STORE_REVCHAR( uvc ); - } - if ( set_bit ) { - /* store the codepoint in the bitmap, and its folded - * equivalent. */ - TRIE_BITMAP_SET_FOLDED(trie, uvc, folder); - set_bit = 0; /* We've done our bit :-) */ - } - } else { - - /* XXX We could come up with the list of code points that fold - * to this using PL_utf8_foldclosures, except not for - * multi-char folds, as there may be multiple combinations - * there that could work, which needs to wait until runtime to - * resolve (The comment about LIGATURE FFI above is such an - * example */ - - SV** svpp; - if ( !widecharmap ) - widecharmap = newHV(); - - svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); - - if ( !svpp ) - Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc ); - - if ( !SvTRUE( *svpp ) ) { - sv_setiv( *svpp, ++trie->uniquecharcount ); - TRIE_STORE_REVCHAR(uvc); - } - } - } /* end loop through characters in this branch of the trie */ - - /* We take the min and max for this branch and combine to find the min - * and max for all branches processed so far */ - if( cur == first ) { - trie->minlen = minchars; - trie->maxlen = maxchars; - } else if (minchars < trie->minlen) { - trie->minlen = minchars; - } else if (maxchars > trie->maxlen) { - trie->maxlen = maxchars; - } - } /* end first pass */ - DEBUG_TRIE_COMPILE_r( - Perl_re_indentf( aTHX_ - "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", - depth+1, - ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, - (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, - (int)trie->minlen, (int)trie->maxlen ) - ); - - /* - We now know what we are dealing with in terms of unique chars and - string sizes so we can calculate how much memory a naive - representation using a flat table will take. If it's over a reasonable - limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory - conservative but potentially much slower representation using an array - of lists. - - At the end we convert both representations into the same compressed - form that will be used in regexec.c for matching with. The latter - is a form that cannot be used to construct with but has memory - properties similar to the list form and access properties similar - to the table form making it both suitable for fast searches and - small enough that its feasable to store for the duration of a program. - - See the comment in the code where the compressed table is produced - inplace from the flat tabe representation for an explanation of how - the compression works. - - */ - - - Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); - prev_states[1] = 0; - - if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) - > SvIV(re_trie_maxbuff) ) - { - /* - Second Pass -- Array Of Lists Representation - - Each state will be represented by a list of charid:state records - (reg_trie_trans_le) the first such element holds the CUR and LEN - points of the allocated array. (See defines above). - - We build the initial structure using the lists, and then convert - it into the compressed table form which allows faster lookups - (but cant be modified once converted). - */ - - STRLEN transcount = 1; - - DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n", - depth+1)); - - trie->states = (reg_trie_state *) - PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, - sizeof(reg_trie_state) ); - TRIE_LIST_NEW(1); - next_alloc = 2; - - for ( cur = first ; cur < last ; cur = regnext( cur ) ) { - - regnode *noper = REGNODE_AFTER( cur ); - U32 state = 1; /* required init */ - U16 charid = 0; /* sanity init */ - U32 wordlen = 0; /* required init */ - - if (OP(noper) == NOTHING) { - regnode *noper_next= regnext(noper); - if (noper_next < tail) - noper= noper_next; - /* we will undo this assignment if noper does not - * point at a trieable type in the else clause of - * the following statement. */ - } - - if ( noper < tail - && ( OP(noper) == flags - || (flags == EXACT && OP(noper) == EXACT_REQ8) - || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 - || OP(noper) == EXACTFUP)))) - { - const U8 *uc= (U8*)STRING(noper); - const U8 *e= uc + STR_LEN(noper); - - for ( ; uc < e ; uc += len ) { - - TRIE_READ_CHAR; - - if ( uvc < 256 ) { - charid = trie->charmap[ uvc ]; - } else { - SV** const svpp = hv_fetch( widecharmap, - (char*)&uvc, - sizeof( UV ), - 0); - if ( !svpp ) { - charid = 0; - } else { - charid=(U16)SvIV( *svpp ); - } - } - /* charid is now 0 if we dont know the char read, or - * nonzero if we do */ - if ( charid ) { - - U16 check; - U32 newstate = 0; - - charid--; - if ( !trie->states[ state ].trans.list ) { - TRIE_LIST_NEW( state ); - } - for ( check = 1; - check <= TRIE_LIST_USED( state ); - check++ ) - { - if ( TRIE_LIST_ITEM( state, check ).forid - == charid ) - { - newstate = TRIE_LIST_ITEM( state, check ).newstate; - break; - } - } - if ( ! newstate ) { - newstate = next_alloc++; - prev_states[newstate] = state; - TRIE_LIST_PUSH( state, charid, newstate ); - transcount++; - } - state = newstate; - } else { - Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc ); - } - } - } else { - /* If we end up here it is because we skipped past a NOTHING, but did not end up - * on a trieable type. So we need to reset noper back to point at the first regop - * in the branch before we call TRIE_HANDLE_WORD() - */ - noper= REGNODE_AFTER(cur); - } - TRIE_HANDLE_WORD(state); - - } /* end second pass */ - - /* next alloc is the NEXT state to be allocated */ - trie->statecount = next_alloc; - trie->states = (reg_trie_state *) - PerlMemShared_realloc( trie->states, - next_alloc - * sizeof(reg_trie_state) ); - - /* and now dump it out before we compress it */ - DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, - revcharmap, next_alloc, - depth+1) - ); - - trie->trans = (reg_trie_trans *) - PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); - { - U32 state; - U32 tp = 0; - U32 zp = 0; - - - for( state=1 ; state < next_alloc ; state ++ ) { - U32 base=0; - - /* - DEBUG_TRIE_COMPILE_MORE_r( - Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp) - ); - */ - - if (trie->states[state].trans.list) { - U16 minid=TRIE_LIST_ITEM( state, 1).forid; - U16 maxid=minid; - U16 idx; - - for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { - const U16 forid = TRIE_LIST_ITEM( state, idx).forid; - if ( forid < minid ) { - minid=forid; - } else if ( forid > maxid ) { - maxid=forid; - } - } - if ( transcount < tp + maxid - minid + 1) { - transcount *= 2; - trie->trans = (reg_trie_trans *) - PerlMemShared_realloc( trie->trans, - transcount - * sizeof(reg_trie_trans) ); - Zero( trie->trans + (transcount / 2), - transcount / 2, - reg_trie_trans ); - } - base = trie->uniquecharcount + tp - minid; - if ( maxid == minid ) { - U32 set = 0; - for ( ; zp < tp ; zp++ ) { - if ( ! trie->trans[ zp ].next ) { - base = trie->uniquecharcount + zp - minid; - trie->trans[ zp ].next = TRIE_LIST_ITEM( state, - 1).newstate; - trie->trans[ zp ].check = state; - set = 1; - break; - } - } - if ( !set ) { - trie->trans[ tp ].next = TRIE_LIST_ITEM( state, - 1).newstate; - trie->trans[ tp ].check = state; - tp++; - zp = tp; - } - } else { - for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { - const U32 tid = base - - trie->uniquecharcount - + TRIE_LIST_ITEM( state, idx ).forid; - trie->trans[ tid ].next = TRIE_LIST_ITEM( state, - idx ).newstate; - trie->trans[ tid ].check = state; - } - tp += ( maxid - minid + 1 ); - } - Safefree(trie->states[ state ].trans.list); - } - /* - DEBUG_TRIE_COMPILE_MORE_r( - Perl_re_printf( aTHX_ " base: %d\n",base); - ); - */ - trie->states[ state ].trans.base=base; - } - trie->lasttrans = tp + 1; - } - } else { - /* - Second Pass -- Flat Table Representation. - - we dont use the 0 slot of either trans[] or states[] so we add 1 to - each. We know that we will need Charcount+1 trans at most to store - the data (one row per char at worst case) So we preallocate both - structures assuming worst case. - - We then construct the trie using only the .next slots of the entry - structs. - - We use the .check field of the first entry of the node temporarily - to make compression both faster and easier by keeping track of how - many non zero fields are in the node. - - Since trans are numbered from 1 any 0 pointer in the table is a FAIL - transition. - - There are two terms at use here: state as a TRIE_NODEIDX() which is - a number representing the first entry of the node, and state as a - TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) - and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) - if there are 2 entrys per node. eg: - - A B A B - 1. 2 4 1. 3 7 - 2. 0 3 3. 0 5 - 3. 0 0 5. 0 0 - 4. 0 0 7. 0 0 - - The table is internally in the right hand, idx form. However as we - also have to deal with the states array which is indexed by nodenum - we have to use TRIE_NODENUM() to convert. - - */ - DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n", - depth+1)); - - trie->trans = (reg_trie_trans *) - PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) - * trie->uniquecharcount + 1, - sizeof(reg_trie_trans) ); - trie->states = (reg_trie_state *) - PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, - sizeof(reg_trie_state) ); - next_alloc = trie->uniquecharcount + 1; - - - for ( cur = first ; cur < last ; cur = regnext( cur ) ) { - - regnode *noper = REGNODE_AFTER( cur ); - - U32 state = 1; /* required init */ - - U16 charid = 0; /* sanity init */ - U32 accept_state = 0; /* sanity init */ - - U32 wordlen = 0; /* required init */ - - if (OP(noper) == NOTHING) { - regnode *noper_next= regnext(noper); - if (noper_next < tail) - noper= noper_next; - /* we will undo this assignment if noper does not - * point at a trieable type in the else clause of - * the following statement. */ - } - - if ( noper < tail - && ( OP(noper) == flags - || (flags == EXACT && OP(noper) == EXACT_REQ8) - || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 - || OP(noper) == EXACTFUP)))) - { - const U8 *uc= (U8*)STRING(noper); - const U8 *e= uc + STR_LEN(noper); - - for ( ; uc < e ; uc += len ) { - - TRIE_READ_CHAR; - - if ( uvc < 256 ) { - charid = trie->charmap[ uvc ]; - } else { - SV* const * const svpp = hv_fetch( widecharmap, - (char*)&uvc, - sizeof( UV ), - 0); - charid = svpp ? (U16)SvIV(*svpp) : 0; - } - if ( charid ) { - charid--; - if ( !trie->trans[ state + charid ].next ) { - trie->trans[ state + charid ].next = next_alloc; - trie->trans[ state ].check++; - prev_states[TRIE_NODENUM(next_alloc)] - = TRIE_NODENUM(state); - next_alloc += trie->uniquecharcount; - } - state = trie->trans[ state + charid ].next; - } else { - Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc ); - } - /* charid is now 0 if we dont know the char read, or - * nonzero if we do */ - } - } else { - /* If we end up here it is because we skipped past a NOTHING, but did not end up - * on a trieable type. So we need to reset noper back to point at the first regop - * in the branch before we call TRIE_HANDLE_WORD(). - */ - noper= REGNODE_AFTER(cur); - } - accept_state = TRIE_NODENUM( state ); - TRIE_HANDLE_WORD(accept_state); - - } /* end second pass */ - - /* and now dump it out before we compress it */ - DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, - revcharmap, - next_alloc, depth+1)); - - { - /* - * Inplace compress the table.* - - For sparse data sets the table constructed by the trie algorithm will - be mostly 0/FAIL transitions or to put it another way mostly empty. - (Note that leaf nodes will not contain any transitions.) - - This algorithm compresses the tables by eliminating most such - transitions, at the cost of a modest bit of extra work during lookup: - - - Each states[] entry contains a .base field which indicates the - index in the state[] array wheres its transition data is stored. - - - If .base is 0 there are no valid transitions from that node. - - - If .base is nonzero then charid is added to it to find an entry in - the trans array. - - -If trans[states[state].base+charid].check!=state then the - transition is taken to be a 0/Fail transition. Thus if there are fail - transitions at the front of the node then the .base offset will point - somewhere inside the previous nodes data (or maybe even into a node - even earlier), but the .check field determines if the transition is - valid. - - XXX - wrong maybe? - The following process inplace converts the table to the compressed - table: We first do not compress the root node 1,and mark all its - .check pointers as 1 and set its .base pointer as 1 as well. This - allows us to do a DFA construction from the compressed table later, - and ensures that any .base pointers we calculate later are greater - than 0. - - - We set 'pos' to indicate the first entry of the second node. - - - We then iterate over the columns of the node, finding the first and - last used entry at l and m. We then copy l..m into pos..(pos+m-l), - and set the .check pointers accordingly, and advance pos - appropriately and repreat for the next node. Note that when we copy - the next pointers we have to convert them from the original - NODEIDX form to NODENUM form as the former is not valid post - compression. - - - If a node has no transitions used we mark its base as 0 and do not - advance the pos pointer. - - - If a node only has one transition we use a second pointer into the - structure to fill in allocated fail transitions from other states. - This pointer is independent of the main pointer and scans forward - looking for null transitions that are allocated to a state. When it - finds one it writes the single transition into the "hole". If the - pointer doesnt find one the single transition is appended as normal. - - - Once compressed we can Renew/realloc the structures to release the - excess space. - - See "Table-Compression Methods" in sec 3.9 of the Red Dragon, - specifically Fig 3.47 and the associated pseudocode. - - demq - */ - const U32 laststate = TRIE_NODENUM( next_alloc ); - U32 state, charid; - U32 pos = 0, zp=0; - trie->statecount = laststate; - - for ( state = 1 ; state < laststate ; state++ ) { - U8 flag = 0; - const U32 stateidx = TRIE_NODEIDX( state ); - const U32 o_used = trie->trans[ stateidx ].check; - U32 used = trie->trans[ stateidx ].check; - trie->trans[ stateidx ].check = 0; - - for ( charid = 0; - used && charid < trie->uniquecharcount; - charid++ ) - { - if ( flag || trie->trans[ stateidx + charid ].next ) { - if ( trie->trans[ stateidx + charid ].next ) { - if (o_used == 1) { - for ( ; zp < pos ; zp++ ) { - if ( ! trie->trans[ zp ].next ) { - break; - } - } - trie->states[ state ].trans.base - = zp - + trie->uniquecharcount - - charid ; - trie->trans[ zp ].next - = SAFE_TRIE_NODENUM( trie->trans[ stateidx - + charid ].next ); - trie->trans[ zp ].check = state; - if ( ++zp > pos ) pos = zp; - break; - } - used--; - } - if ( !flag ) { - flag = 1; - trie->states[ state ].trans.base - = pos + trie->uniquecharcount - charid ; - } - trie->trans[ pos ].next - = SAFE_TRIE_NODENUM( - trie->trans[ stateidx + charid ].next ); - trie->trans[ pos ].check = state; - pos++; - } - } - } - trie->lasttrans = pos + 1; - trie->states = (reg_trie_state *) - PerlMemShared_realloc( trie->states, laststate - * sizeof(reg_trie_state) ); - DEBUG_TRIE_COMPILE_MORE_r( - Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n", - depth+1, - (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount - + 1 ), - (IV)next_alloc, - (IV)pos, - ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); - ); - - } /* end table compress */ - } - DEBUG_TRIE_COMPILE_MORE_r( - Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n", - depth+1, - (UV)trie->statecount, - (UV)trie->lasttrans) - ); - /* resize the trans array to remove unused space */ - trie->trans = (reg_trie_trans *) - PerlMemShared_realloc( trie->trans, trie->lasttrans - * sizeof(reg_trie_trans) ); - - { /* Modify the program and insert the new TRIE node */ - U8 nodetype =(U8) flags; - char *str=NULL; - -#ifdef DEBUGGING - regnode *optimize = NULL; -#endif /* DEBUGGING */ - /* - This means we convert either the first branch or the first Exact, - depending on whether the thing following (in 'last') is a branch - or not and whther first is the startbranch (ie is it a sub part of - the alternation or is it the whole thing.) - Assuming its a sub part we convert the EXACT otherwise we convert - the whole branch sequence, including the first. - */ - /* Find the node we are going to overwrite */ - if ( first != startbranch || OP( last ) == BRANCH ) { - /* branch sub-chain */ - NEXT_OFF( first ) = (U16)(last - first); - /* whole branch chain */ - } - /* But first we check to see if there is a common prefix we can - split out as an EXACT and put in front of the TRIE node. */ - trie->startstate= 1; - if ( trie->bitmap && !widecharmap && !trie->jump ) { - /* we want to find the first state that has more than - * one transition, if that state is not the first state - * then we have a common prefix which we can remove. - */ - U32 state; - for ( state = 1 ; state < trie->statecount-1 ; state++ ) { - U32 ofs = 0; - I32 first_ofs = -1; /* keeps track of the ofs of the first - transition, -1 means none */ - U32 count = 0; - const U32 base = trie->states[ state ].trans.base; - - /* does this state terminate an alternation? */ - if ( trie->states[state].wordnum ) - count = 1; - - for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { - if ( ( base + ofs >= trie->uniquecharcount ) && - ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && - trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) - { - if ( ++count > 1 ) { - /* we have more than one transition */ - SV **tmp; - U8 *ch; - /* if this is the first state there is no common prefix - * to extract, so we can exit */ - if ( state == 1 ) break; - tmp = av_fetch_simple( revcharmap, ofs, 0); - ch = (U8*)SvPV_nolen_const( *tmp ); - - /* if we are on count 2 then we need to initialize the - * bitmap, and store the previous char if there was one - * in it*/ - if ( count == 2 ) { - /* clear the bitmap */ - Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); - DEBUG_OPTIMISE_r( - Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [", - depth+1, - (UV)state)); - if (first_ofs >= 0) { - SV ** const tmp = av_fetch_simple( revcharmap, first_ofs, 0); - const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); - - TRIE_BITMAP_SET_FOLDED(trie,*ch, folder); - DEBUG_OPTIMISE_r( - Perl_re_printf( aTHX_ "%s", (char*)ch) - ); - } - } - /* store the current firstchar in the bitmap */ - TRIE_BITMAP_SET_FOLDED(trie,*ch, folder); - DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch)); - } - first_ofs = ofs; - } - } - if ( count == 1 ) { - /* This state has only one transition, its transition is part - * of a common prefix - we need to concatenate the char it - * represents to what we have so far. */ - SV **tmp = av_fetch_simple( revcharmap, first_ofs, 0); - STRLEN len; - char *ch = SvPV( *tmp, len ); - DEBUG_OPTIMISE_r({ - SV *sv=sv_newmortal(); - Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n", - depth+1, - (UV)state, (UV)first_ofs, - pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, - PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_ESCAPE_FIRSTCHAR - ) - ); - }); - if ( state==1 ) { - OP( convert ) = nodetype; - str=STRING(convert); - setSTR_LEN(convert, 0); - } - assert( ( STR_LEN(convert) + len ) < 256 ); - setSTR_LEN(convert, (U8)(STR_LEN(convert) + len)); - while (len--) - *str++ = *ch++; - } else { -#ifdef DEBUGGING - if (state>1) - DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n")); -#endif - break; - } - } - trie->prefixlen = (state-1); - if (str) { - regnode *n = REGNODE_AFTER(convert); - assert( n - convert <= U16_MAX ); - NEXT_OFF(convert) = n - convert; - trie->startstate = state; - trie->minlen -= (state - 1); - trie->maxlen -= (state - 1); -#ifdef DEBUGGING - /* At least the UNICOS C compiler choked on this - * being argument to DEBUG_r(), so let's just have - * it right here. */ - if ( -#ifdef PERL_EXT_RE_BUILD - 1 -#else - DEBUG_r_TEST -#endif - ) { - U32 word = trie->wordcount; - while (word--) { - SV ** const tmp = av_fetch_simple( trie_words, word, 0 ); - if (tmp) { - if ( STR_LEN(convert) <= SvCUR(*tmp) ) - sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); - else - sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); - } - } - } -#endif - if (trie->maxlen) { - convert = n; - } else { - NEXT_OFF(convert) = (U16)(tail - convert); - DEBUG_r(optimize= n); - } - } - } - if (!jumper) - jumper = last; - if ( trie->maxlen ) { - NEXT_OFF( convert ) = (U16)(tail - convert); - ARG_SET( convert, data_slot ); - /* Store the offset to the first unabsorbed branch in - jump[0], which is otherwise unused by the jump logic. - We use this when dumping a trie and during optimisation. */ - if (trie->jump) - trie->jump[0] = (U16)(nextbranch - convert); - - /* If the start state is not accepting (meaning there is no empty string/NOTHING) - * and there is a bitmap - * and the first "jump target" node we found leaves enough room - * then convert the TRIE node into a TRIEC node, with the bitmap - * embedded inline in the opcode - this is hypothetically faster. - */ - if ( !trie->states[trie->startstate].wordnum - && trie->bitmap - && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) - { - OP( convert ) = TRIEC; - Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); - PerlMemShared_free(trie->bitmap); - trie->bitmap= NULL; - } else - OP( convert ) = TRIE; - - /* store the type in the flags */ - convert->flags = nodetype; - DEBUG_r({ - optimize = convert - + NODE_STEP_REGNODE - + REGNODE_ARG_LEN( OP( convert ) ); - }); - /* XXX We really should free up the resource in trie now, - as we won't use them - (which resources?) dmq */ - } - /* needed for dumping*/ - DEBUG_r(if (optimize) { - /* - Try to clean up some of the debris left after the - optimisation. - */ - while( optimize < jumper ) { - OP( optimize ) = OPTIMIZED; - optimize++; - } - }); - } /* end node insert */ - - /* Finish populating the prev field of the wordinfo array. Walk back - * from each accept state until we find another accept state, and if - * so, point the first word's .prev field at the second word. If the - * second already has a .prev field set, stop now. This will be the - * case either if we've already processed that word's accept state, - * or that state had multiple words, and the overspill words were - * already linked up earlier. - */ - { - U16 word; - U32 state; - U16 prev; - - for (word=1; word <= trie->wordcount; word++) { - prev = 0; - if (trie->wordinfo[word].prev) - continue; - state = trie->wordinfo[word].accept; - while (state) { - state = prev_states[state]; - if (!state) - break; - prev = trie->states[state].wordnum; - if (prev) - break; - } - trie->wordinfo[word].prev = prev; - } - Safefree(prev_states); - } - - - /* and now dump out the compressed format */ - DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); - - RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; -#ifdef DEBUGGING - RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; - RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; -#else - SvREFCNT_dec_NN(revcharmap); -#endif - return trie->jump - ? MADE_JUMP_TRIE - : trie->startstate>1 - ? MADE_EXACT_TRIE - : MADE_TRIE; -} - -STATIC regnode * -S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) -{ -/* The Trie is constructed and compressed now so we can build a fail array if - * it's needed - - This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and - 3.32 in the - "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, - Ullman 1985/88 - ISBN 0-201-10088-6 - - We find the fail state for each state in the trie, this state is the longest - proper suffix of the current state's 'word' that is also a proper prefix of - another word in our trie. State 1 represents the word '' and is thus the - default fail state. This allows the DFA not to have to restart after its - tried and failed a word at a given point, it simply continues as though it - had been matching the other word in the first place. - Consider - 'abcdgu'=~/abcdefg|cdgu/ - When we get to 'd' we are still matching the first word, we would encounter - 'g' which would fail, which would bring us to the state representing 'd' in - the second word where we would try 'g' and succeed, proceeding to match - 'cdgu'. - */ - /* add a fail transition */ - const U32 trie_offset = ARG(source); - reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; - U32 *q; - const U32 ucharcount = trie->uniquecharcount; - const U32 numstates = trie->statecount; - const U32 ubound = trie->lasttrans + ucharcount; - U32 q_read = 0; - U32 q_write = 0; - U32 charid; - U32 base = trie->states[ 1 ].trans.base; - U32 *fail; - reg_ac_data *aho; - const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); - regnode *stclass; - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; - PERL_UNUSED_CONTEXT; -#ifndef DEBUGGING - PERL_UNUSED_ARG(depth); -#endif - - if ( OP(source) == TRIE ) { - struct regnode_1 *op = (struct regnode_1 *) - PerlMemShared_calloc(1, sizeof(struct regnode_1)); - StructCopy(source, op, struct regnode_1); - stclass = (regnode *)op; - } else { - struct regnode_charclass *op = (struct regnode_charclass *) - PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); - StructCopy(source, op, struct regnode_charclass); - stclass = (regnode *)op; - } - OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */ - - ARG_SET( stclass, data_slot ); - aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); - RExC_rxi->data->data[ data_slot ] = (void*)aho; - aho->trie=trie_offset; - aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); - Copy( trie->states, aho->states, numstates, reg_trie_state ); - Newx( q, numstates, U32); - aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); - aho->refcount = 1; - fail = aho->fail; - /* initialize fail[0..1] to be 1 so that we always have - a valid final fail state */ - fail[ 0 ] = fail[ 1 ] = 1; - - for ( charid = 0; charid < ucharcount ; charid++ ) { - const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); - if ( newstate ) { - q[ q_write ] = newstate; - /* set to point at the root */ - fail[ q[ q_write++ ] ]=1; - } - } - while ( q_read < q_write) { - const U32 cur = q[ q_read++ % numstates ]; - base = trie->states[ cur ].trans.base; - - for ( charid = 0 ; charid < ucharcount ; charid++ ) { - const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); - if (ch_state) { - U32 fail_state = cur; - U32 fail_base; - do { - fail_state = fail[ fail_state ]; - fail_base = aho->states[ fail_state ].trans.base; - } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); - - fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); - fail[ ch_state ] = fail_state; - if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) - { - aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; - } - q[ q_write++ % numstates] = ch_state; - } - } - } - /* restore fail[0..1] to 0 so that we "fall out" of the AC loop - when we fail in state 1, this allows us to use the - charclass scan to find a valid start char. This is based on the principle - that theres a good chance the string being searched contains lots of stuff - that cant be a start char. - */ - fail[ 0 ] = fail[ 1 ] = 0; - DEBUG_TRIE_COMPILE_r({ - Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0", - depth, (UV)numstates - ); - for( q_read=1; q_read<numstates; q_read++ ) { - Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]); - } - Perl_re_printf( aTHX_ "\n"); - }); - Safefree(q); - /*RExC_seen |= REG_TRIEDFA_SEEN;*/ - return stclass; -} - - -/* The below joins as many adjacent EXACTish nodes as possible into a single - * one. The regop may be changed if the node(s) contain certain sequences that - * require special handling. The joining is only done if: - * 1) there is room in the current conglomerated node to entirely contain the - * next one. - * 2) they are compatible node types - * - * The adjacent nodes actually may be separated by NOTHING-kind nodes, and - * these get optimized out - * - * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full - * as possible, even if that means splitting an existing node so that its first - * part is moved to the preceding node. This would maximise the efficiency of - * memEQ during matching. - * - * If a node is to match under /i (folded), the number of characters it matches - * can be different than its character length if it contains a multi-character - * fold. *min_subtract is set to the total delta number of characters of the - * input nodes. - * - * And *unfolded_multi_char is set to indicate whether or not the node contains - * an unfolded multi-char fold. This happens when it won't be known until - * runtime whether the fold is valid or not; namely - * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the - * target string being matched against turns out to be UTF-8 is that fold - * valid; or - * 2) for EXACTFL nodes whose folding rules depend on the locale in force at - * runtime. - * (Multi-char folds whose components are all above the Latin1 range are not - * run-time locale dependent, and have already been folded by the time this - * function is called.) - * - * This is as good a place as any to discuss the design of handling these - * multi-character fold sequences. It's been wrong in Perl for a very long - * time. There are three code points in Unicode whose multi-character folds - * were long ago discovered to mess things up. The previous designs for - * dealing with these involved assigning a special node for them. This - * approach doesn't always work, as evidenced by this example: - * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches - * Both sides fold to "sss", but if the pattern is parsed to create a node that - * would match just the \xDF, it won't be able to handle the case where a - * successful match would have to cross the node's boundary. The new approach - * that hopefully generally solves the problem generates an EXACTFUP node - * that is "sss" in this case. - * - * It turns out that there are problems with all multi-character folds, and not - * just these three. Now the code is general, for all such cases. The - * approach taken is: - * 1) This routine examines each EXACTFish node that could contain multi- - * character folded sequences. Since a single character can fold into - * such a sequence, the minimum match length for this node is less than - * the number of characters in the node. This routine returns in - * *min_subtract how many characters to subtract from the actual - * length of the string to get a real minimum match length; it is 0 if - * there are no multi-char foldeds. This delta is used by the caller to - * adjust the min length of the match, and the delta between min and max, - * so that the optimizer doesn't reject these possibilities based on size - * constraints. - * - * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF) - * under /u, we fold it to 'ss' in regatom(), and in this routine, after - * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8 - * EXACTFU nodes. The node type of such nodes is then changed to - * EXACTFUP, indicating it is problematic, and needs careful handling. - * (The procedures in step 1) above are sufficient to handle this case in - * UTF-8 encoded nodes.) The reason this is problematic is that this is - * the only case where there is a possible fold length change in non-UTF-8 - * patterns. By reserving a special node type for problematic cases, the - * far more common regular EXACTFU nodes can be processed faster. - * regexec.c takes advantage of this. - * - * EXACTFUP has been created as a grab-bag for (hopefully uncommon) - * problematic cases. These all only occur when the pattern is not - * UTF-8. In addition to the 'ss' sequence where there is a possible fold - * length change, it handles the situation where the string cannot be - * entirely folded. The strings in an EXACTFish node are folded as much - * as possible during compilation in regcomp.c. This saves effort in - * regex matching. By using an EXACTFUP node when it is not possible to - * fully fold at compile time, regexec.c can know that everything in an - * EXACTFU node is folded, so folding can be skipped at runtime. The only - * case where folding in EXACTFU nodes can't be done at compile time is - * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This - * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes - * handle two very different cases. Alternatively, there could have been - * a node type where there are length changes, one for unfolded, and one - * for both. If yet another special case needed to be created, the number - * of required node types would have to go to 7. khw figures that even - * though there are plenty of node types to spare, that the maintenance - * cost wasn't worth the small speedup of doing it that way, especially - * since he thinks the MICRO SIGN is rarely encountered in practice. - * - * There are other cases where folding isn't done at compile time, but - * none of them are under /u, and hence not for EXACTFU nodes. The folds - * in EXACTFL nodes aren't known until runtime, and vary as the locale - * changes. Some folds in EXACTF depend on if the runtime target string - * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di - * when no fold in it depends on the UTF-8ness of the target string.) - * - * 3) A problem remains for unfolded multi-char folds. (These occur when the - * validity of the fold won't be known until runtime, and so must remain - * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA - * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot - * be an EXACTF node with a UTF-8 pattern.) They also occur for various - * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) - * The reason this is a problem is that the optimizer part of regexec.c - * (probably unwittingly, in Perl_regexec_flags()) makes an assumption - * that a character in the pattern corresponds to at most a single - * character in the target string. (And I do mean character, and not byte - * here, unlike other parts of the documentation that have never been - * updated to account for multibyte Unicode.) Sharp s in EXACTF and - * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA - * nodes it can match "\x{17F}\x{17F}". These, along with other ones in - * EXACTFL nodes, violate the assumption, and they are the only instances - * where it is violated. I'm reluctant to try to change the assumption, - * as the code involved is impenetrable to me (khw), so instead the code - * here punts. This routine examines EXACTFL nodes, and (when the pattern - * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a - * boolean indicating whether or not the node contains such a fold. When - * it is true, the caller sets a flag that later causes the optimizer in - * this file to not set values for the floating and fixed string lengths, - * and thus avoids the optimizer code in regexec.c that makes the invalid - * assumption. Thus, there is no optimization based on string lengths for - * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern - * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the - * assumption is wrong only in these cases is that all other non-UTF-8 - * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to - * their expanded versions. (Again, we can't prefold sharp s to 'ss' in - * EXACTF nodes because we don't know at compile time if it actually - * matches 'ss' or not. For EXACTF nodes it will match iff the target - * string is in UTF-8. This is in contrast to EXACTFU nodes, where it - * always matches; and EXACTFAA where it never does. In an EXACTFAA node - * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the - * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 - * string would require the pattern to be forced into UTF-8, the overhead - * of which we want to avoid. Similarly the unfolded multi-char folds in - * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 - * locale.) - * - * Similarly, the code that generates tries doesn't currently handle - * not-already-folded multi-char folds, and it looks like a pain to change - * that. Therefore, trie generation of EXACTFAA nodes with the sharp s - * doesn't work. Instead, such an EXACTFAA is turned into a new regnode, - * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people - * using /iaa matching will be doing so almost entirely with ASCII - * strings, so this should rarely be encountered in practice */ - -STATIC U32 -S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, - UV *min_subtract, bool *unfolded_multi_char, - U32 flags, regnode *val, U32 depth) -{ - /* Merge several consecutive EXACTish nodes into one. */ - - regnode *n = regnext(scan); - U32 stringok = 1; - regnode *next = REGNODE_AFTER_varies(scan); - U32 merged = 0; - U32 stopnow = 0; -#ifdef DEBUGGING - regnode *stop = scan; - DECLARE_AND_GET_RE_DEBUG_FLAGS; -#else - PERL_UNUSED_ARG(depth); -#endif - - PERL_ARGS_ASSERT_JOIN_EXACT; -#ifndef EXPERIMENTAL_INPLACESCAN - PERL_UNUSED_ARG(flags); - PERL_UNUSED_ARG(val); -#endif - DEBUG_PEEP("join", scan, depth, 0); - - assert(REGNODE_TYPE(OP(scan)) == EXACT); - - /* Look through the subsequent nodes in the chain. Skip NOTHING, merge - * EXACT ones that are mergeable to the current one. */ - while ( n - && ( REGNODE_TYPE(OP(n)) == NOTHING - || (stringok && REGNODE_TYPE(OP(n)) == EXACT)) - && NEXT_OFF(n) - && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) - { - - if (OP(n) == TAIL || n > next) - stringok = 0; - if (REGNODE_TYPE(OP(n)) == NOTHING) { - DEBUG_PEEP("skip:", n, depth, 0); - NEXT_OFF(scan) += NEXT_OFF(n); - next = n + NODE_STEP_REGNODE; -#ifdef DEBUGGING - if (stringok) - stop = n; -#endif - n = regnext(n); - } - else if (stringok) { - const unsigned int oldl = STR_LEN(scan); - regnode * const nnext = regnext(n); - - /* XXX I (khw) kind of doubt that this works on platforms (should - * Perl ever run on one) where U8_MAX is above 255 because of lots - * of other assumptions */ - /* Don't join if the sum can't fit into a single node */ - if (oldl + STR_LEN(n) > U8_MAX) - break; - - /* Joining something that requires UTF-8 with something that - * doesn't, means the result requires UTF-8. */ - if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) { - OP(scan) = EXACT_REQ8; - } - else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) { - ; /* join is compatible, no need to change OP */ - } - else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) { - OP(scan) = EXACTFU_REQ8; - } - else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) { - ; /* join is compatible, no need to change OP */ - } - else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) { - ; /* join is compatible, no need to change OP */ - } - else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) { - - /* Under /di, temporary EXACTFU_S_EDGE nodes are generated, - * which can join with EXACTFU ones. We check for this case - * here. These need to be resolved to either EXACTFU or - * EXACTF at joining time. They have nothing in them that - * would forbid them from being the more desirable EXACTFU - * nodes except that they begin and/or end with a single [Ss]. - * The reason this is problematic is because they could be - * joined in this loop with an adjacent node that ends and/or - * begins with [Ss] which would then form the sequence 'ss', - * which matches differently under /di than /ui, in which case - * EXACTFU can't be used. If the 'ss' sequence doesn't get - * formed, the nodes get absorbed into any adjacent EXACTFU - * node. And if the only adjacent node is EXACTF, they get - * absorbed into that, under the theory that a longer node is - * better than two shorter ones, even if one is EXACTFU. Note - * that EXACTFU_REQ8 is generated only for UTF-8 patterns, - * and the EXACTFU_S_EDGE ones only for non-UTF-8. */ - - if (STRING(n)[STR_LEN(n)-1] == 's') { - - /* Here the joined node would end with 's'. If the node - * following the combination is an EXACTF one, it's better to - * join this trailing edge 's' node with that one, leaving the - * current one in 'scan' be the more desirable EXACTFU */ - if (OP(nnext) == EXACTF) { - break; - } - - OP(scan) = EXACTFU_S_EDGE; - - } /* Otherwise, the beginning 's' of the 2nd node just - becomes an interior 's' in 'scan' */ - } - else if (OP(scan) == EXACTF && OP(n) == EXACTF) { - ; /* join is compatible, no need to change OP */ - } - else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) { - - /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE - * nodes. But the latter nodes can be also joined with EXACTFU - * ones, and that is a better outcome, so if the node following - * 'n' is EXACTFU, quit now so that those two can be joined - * later */ - if (OP(nnext) == EXACTFU) { - break; - } - - /* The join is compatible, and the combined node will be - * EXACTF. (These don't care if they begin or end with 's' */ - } - else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) { - if ( STRING(scan)[STR_LEN(scan)-1] == 's' - && STRING(n)[0] == 's') - { - /* When combined, we have the sequence 'ss', which means we - * have to remain /di */ - OP(scan) = EXACTF; - } - } - else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) { - if (STRING(n)[0] == 's') { - ; /* Here the join is compatible and the combined node - starts with 's', no need to change OP */ - } - else { /* Now the trailing 's' is in the interior */ - OP(scan) = EXACTFU; - } - } - else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) { - - /* The join is compatible, and the combined node will be - * EXACTF. (These don't care if they begin or end with 's' */ - OP(scan) = EXACTF; - } - else if (OP(scan) != OP(n)) { - - /* The only other compatible joinings are the same node type */ - break; - } - - DEBUG_PEEP("merg", n, depth, 0); - merged++; - - next = REGNODE_AFTER_varies(n); - NEXT_OFF(scan) += NEXT_OFF(n); - assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 ); - setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n))); - /* Now we can overwrite *n : */ - Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); -#ifdef DEBUGGING - stop = next - 1; -#endif - n = nnext; - if (stopnow) break; - } - -#ifdef EXPERIMENTAL_INPLACESCAN - if (flags && !NEXT_OFF(n)) { - DEBUG_PEEP("atch", val, depth, 0); - if (REGNODE_OFF_BY_ARG(OP(n))) { - ARG_SET(n, val - n); - } - else { - NEXT_OFF(n) = val - n; - } - stopnow = 1; - } -#endif - } - - /* This temporary node can now be turned into EXACTFU, and must, as - * regexec.c doesn't handle it */ - if (OP(scan) == EXACTFU_S_EDGE) { - OP(scan) = EXACTFU; - } - - *min_subtract = 0; - *unfolded_multi_char = FALSE; - - /* Here, all the adjacent mergeable EXACTish nodes have been merged. We - * can now analyze for sequences of problematic code points. (Prior to - * this final joining, sequences could have been split over boundaries, and - * hence missed). The sequences only happen in folding, hence for any - * non-EXACT EXACTish node */ - if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) { - U8* s0 = (U8*) STRING(scan); - U8* s = s0; - U8* s_end = s0 + STR_LEN(scan); - - int total_count_delta = 0; /* Total delta number of characters that - multi-char folds expand to */ - - /* One pass is made over the node's string looking for all the - * possibilities. To avoid some tests in the loop, there are two main - * cases, for UTF-8 patterns (which can't have EXACTF nodes) and - * non-UTF-8 */ - if (UTF) { - U8* folded = NULL; - - if (OP(scan) == EXACTFL) { - U8 *d; - - /* An EXACTFL node would already have been changed to another - * node type unless there is at least one character in it that - * is problematic; likely a character whose fold definition - * won't be known until runtime, and so has yet to be folded. - * For all but the UTF-8 locale, folds are 1-1 in length, but - * to handle the UTF-8 case, we need to create a temporary - * folded copy using UTF-8 locale rules in order to analyze it. - * This is because our macros that look to see if a sequence is - * a multi-char fold assume everything is folded (otherwise the - * tests in those macros would be too complicated and slow). - * Note that here, the non-problematic folds will have already - * been done, so we can just copy such characters. We actually - * don't completely fold the EXACTFL string. We skip the - * unfolded multi-char folds, as that would just create work - * below to figure out the size they already are */ - - Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); - d = folded; - while (s < s_end) { - STRLEN s_len = UTF8SKIP(s); - if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { - Copy(s, d, s_len, U8); - d += s_len; - } - else if (is_FOLDS_TO_MULTI_utf8(s)) { - *unfolded_multi_char = TRUE; - Copy(s, d, s_len, U8); - d += s_len; - } - else if (isASCII(*s)) { - *(d++) = toFOLD(*s); - } - else { - STRLEN len; - _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL); - d += len; - } - s += s_len; - } - - /* Point the remainder of the routine to look at our temporary - * folded copy */ - s = folded; - s_end = d; - } /* End of creating folded copy of EXACTFL string */ - - /* Examine the string for a multi-character fold sequence. UTF-8 - * patterns have all characters pre-folded by the time this code is - * executed */ - while (s < s_end - 1) /* Can stop 1 before the end, as minimum - length sequence we are looking for is 2 */ - { - int count = 0; /* How many characters in a multi-char fold */ - int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); - if (! len) { /* Not a multi-char fold: get next char */ - s += UTF8SKIP(s); - continue; - } - - { /* Here is a generic multi-char fold. */ - U8* multi_end = s + len; - - /* Count how many characters are in it. In the case of - * /aa, no folds which contain ASCII code points are - * allowed, so check for those, and skip if found. */ - if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) { - count = utf8_length(s, multi_end); - s = multi_end; - } - else { - while (s < multi_end) { - if (isASCII(*s)) { - s++; - goto next_iteration; - } - else { - s += UTF8SKIP(s); - } - count++; - } - } - } - - /* The delta is how long the sequence is minus 1 (1 is how long - * the character that folds to the sequence is) */ - total_count_delta += count - 1; - next_iteration: ; - } - - /* We created a temporary folded copy of the string in EXACTFL - * nodes. Therefore we need to be sure it doesn't go below zero, - * as the real string could be shorter */ - if (OP(scan) == EXACTFL) { - int total_chars = utf8_length((U8*) STRING(scan), - (U8*) STRING(scan) + STR_LEN(scan)); - if (total_count_delta > total_chars) { - total_count_delta = total_chars; - } - } - - *min_subtract += total_count_delta; - Safefree(folded); - } - else if (OP(scan) == EXACTFAA) { - - /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char - * fold to the ASCII range (and there are no existing ones in the - * upper latin1 range). But, as outlined in the comments preceding - * this function, we need to flag any occurrences of the sharp s. - * This character forbids trie formation (because of added - * complexity) */ -#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ - || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ - || UNICODE_DOT_DOT_VERSION > 0) - while (s < s_end) { - if (*s == LATIN_SMALL_LETTER_SHARP_S) { - OP(scan) = EXACTFAA_NO_TRIE; - *unfolded_multi_char = TRUE; - break; - } - s++; - } - } - else if (OP(scan) != EXACTFAA_NO_TRIE) { - - /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char - * folds that are all Latin1. As explained in the comments - * preceding this function, we look also for the sharp s in EXACTF - * and EXACTFL nodes; it can be in the final position. Otherwise - * we can stop looking 1 byte earlier because have to find at least - * two characters for a multi-fold */ - const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) - ? s_end - : s_end -1; - - while (s < upper) { - int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); - if (! len) { /* Not a multi-char fold. */ - if (*s == LATIN_SMALL_LETTER_SHARP_S - && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) - { - *unfolded_multi_char = TRUE; - } - s++; - continue; - } - - if (len == 2 - && isALPHA_FOLD_EQ(*s, 's') - && isALPHA_FOLD_EQ(*(s+1), 's')) - { - - /* EXACTF nodes need to know that the minimum length - * changed so that a sharp s in the string can match this - * ss in the pattern, but they remain EXACTF nodes, as they - * won't match this unless the target string is in UTF-8, - * which we don't know until runtime. EXACTFL nodes can't - * transform into EXACTFU nodes */ - if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { - OP(scan) = EXACTFUP; - } - } - - *min_subtract += len - 1; - s += len; - } -#endif - } - } - -#ifdef DEBUGGING - /* Allow dumping but overwriting the collection of skipped - * ops and/or strings with fake optimized ops */ - n = REGNODE_AFTER_varies(scan); - while (n <= stop) { - OP(n) = OPTIMIZED; - FLAGS(n) = 0; - NEXT_OFF(n) = 0; - n++; - } -#endif - DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);}); - return stopnow; -} - -/* REx optimizer. Converts nodes into quicker variants "in place". - Finds fixed substrings. */ - -/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set - to the position after last scanned or to NULL. */ - -#define INIT_AND_WITHP \ - assert(!and_withp); \ - Newx(and_withp, 1, regnode_ssc); \ - SAVEFREEPV(and_withp) - - -static void -S_unwind_scan_frames(pTHX_ const void *p) -{ - scan_frame *f= (scan_frame *)p; - do { - scan_frame *n= f->next_frame; - Safefree(f); - f= n; - } while (f); -} - -/* Follow the next-chain of the current node and optimize away - all the NOTHINGs from it. - */ -STATIC void -S_rck_elide_nothing(pTHX_ regnode *node) -{ - PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING; - - if (OP(node) != CURLYX) { - const int max = (REGNODE_OFF_BY_ARG(OP(node)) - ? I32_MAX - /* I32 may be smaller than U16 on CRAYs! */ - : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); - int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG(node) : NEXT_OFF(node)); - int noff; - regnode *n = node; - - /* Skip NOTHING and LONGJMP. */ - while ( - (n = regnext(n)) - && ( - (REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n))) - || ((OP(n) == LONGJMP) && (noff = ARG(n))) - ) - && off + noff < max - ) { - off += noff; - } - if (REGNODE_OFF_BY_ARG(OP(node))) - ARG(node) = off; - else - NEXT_OFF(node) = off; - } - return; -} - -/* the return from this sub is the minimum length that could possibly match */ -STATIC SSize_t -S_study_chunk(pTHX_ - RExC_state_t *pRExC_state, - regnode **scanp, /* Start here (read-write). */ - SSize_t *minlenp, /* used for the minlen of substrings? */ - SSize_t *deltap, /* Write maxlen-minlen here. */ - regnode *last, /* Stop before this one. */ - scan_data_t *data, /* string data about the pattern */ - I32 stopparen, /* treat CLOSE-N as END, see GOSUB */ - U32 recursed_depth, /* how deep have we recursed via GOSUB */ - regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */ - U32 flags, /* flags controlling this call, see SCF_ flags */ - U32 depth, /* how deep have we recursed period */ - bool was_mutate_ok /* TRUE if in-place optimizations are allowed. - FALSE only if the caller (recursively) was - prohibited from modifying the regops, because - a higher caller is holding a ptr to them. */ -) -{ - /* vars about the regnodes we are working with */ - regnode *scan = *scanp; /* the current opcode we are inspecting */ - regnode *next = NULL; /* the next opcode beyond scan, tmp var */ - regnode *first_non_open = scan; /* FIXME: should this init to NULL? - the first non open regop, if the init - val IS an OPEN then we will skip past - it just after the var decls section */ - I32 code = 0; /* temp var used to hold the optype of a regop */ - - /* vars about the min and max length of the pattern */ - SSize_t min = 0; /* min length of this part of the pattern */ - SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT - this is adjusted down if we find - an ACCEPT */ - SSize_t delta = 0; /* difference between min and max length - (not accounting for stopmin) */ - - /* vars about capture buffers in the pattern */ - I32 pars = 0; /* count of OPEN opcodes */ - I32 is_par = OP(scan) == OPEN ? PARNO(scan) : 0; /* is this op an OPEN? */ - - /* vars about whether this pattern contains something that can match - * infinitely long strings, eg, X* or X+ */ - int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); - int is_inf_internal = 0; /* The studied chunk is infinite */ - - /* scan_data_t (struct) is used to hold information about the substrings - * and start class we have extracted from the string */ - scan_data_t data_fake; /* temp var used for recursing in some cases */ - - SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do - trie optimizations */ - - scan_frame *frame = NULL; /* used as part of fake recursion */ - - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_STUDY_CHUNK; - RExC_study_started= 1; - - Zero(&data_fake, 1, scan_data_t); - - if ( depth == 0 ) { - while (first_non_open && OP(first_non_open) == OPEN) - first_non_open=regnext(first_non_open); - } - - fake_study_recurse: - DEBUG_r( - RExC_study_chunk_recursed_count++; - ); - DEBUG_OPTIMISE_MORE_r( - { - Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p", - depth, (long)stopparen, - (unsigned long)RExC_study_chunk_recursed_count, - (unsigned long)depth, (unsigned long)recursed_depth, - scan, - last); - if (recursed_depth) { - U32 i; - U32 j; - for ( j = 0 ; j < recursed_depth ; j++ ) { - for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) { - if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) { - Perl_re_printf( aTHX_ " %d",(int)i); - break; - } - } - if ( j + 1 < recursed_depth ) { - Perl_re_printf( aTHX_ ","); - } - } - } - Perl_re_printf( aTHX_ "\n"); - } - ); - while ( scan && OP(scan) != END && scan < last ){ - UV min_subtract = 0; /* How mmany chars to subtract from the minimum - node length to get a real minimum (because - the folded version may be shorter) */ - bool unfolded_multi_char = FALSE; - /* avoid mutating ops if we are anywhere within the recursed or - * enframed handling for a GOSUB: the outermost level will handle it. - */ - bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub); - /* Peephole optimizer: */ - DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta); - DEBUG_PEEP("Peep", scan, depth, flags); - - - /* The reason we do this here is that we need to deal with things like - * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT - * parsing code, as each (?:..) is handled by a different invocation of - * reg() -- Yves - */ - if (REGNODE_TYPE(OP(scan)) == EXACT - && OP(scan) != LEXACT - && OP(scan) != LEXACT_REQ8 - && mutate_ok - ) { - join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char, - 0, NULL, depth + 1); - } - - /* Follow the next-chain of the current node and optimize - away all the NOTHINGs from it. - */ - rck_elide_nothing(scan); - - /* The principal pseudo-switch. Cannot be a switch, since we look into - * several different things. */ - if ( OP(scan) == DEFINEP ) { - SSize_t minlen = 0; - SSize_t deltanext = 0; - SSize_t fake_last_close = 0; - regnode *fake_last_close_op = NULL; - U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY); - - StructCopy(&zero_scan_data, &data_fake, scan_data_t); - scan = regnext(scan); - assert( OP(scan) == IFTHEN ); - DEBUG_PEEP("expect IFTHEN", scan, depth, flags); - - data_fake.last_closep= &fake_last_close; - data_fake.last_close_opp= &fake_last_close_op; - minlen = *minlenp; - next = regnext(scan); - scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN); - DEBUG_PEEP("scan", scan, depth, flags); - DEBUG_PEEP("next", next, depth, flags); - - /* we suppose the run is continuous, last=next... - * NOTE we dont use the return here! */ - /* DEFINEP study_chunk() recursion */ - (void)study_chunk(pRExC_state, &scan, &minlen, - &deltanext, next, &data_fake, stopparen, - recursed_depth, NULL, f, depth+1, mutate_ok); - - scan = next; - } else - if ( - OP(scan) == BRANCH || - OP(scan) == BRANCHJ || - OP(scan) == IFTHEN - ) { - next = regnext(scan); - code = OP(scan); - - /* The op(next)==code check below is to see if we - * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN" - * IFTHEN is special as it might not appear in pairs. - * Not sure whether BRANCH-BRANCHJ is possible, regardless - * we dont handle it cleanly. */ - if (OP(next) == code || code == IFTHEN) { - /* NOTE - There is similar code to this block below for - * handling TRIE nodes on a re-study. If you change stuff here - * check there too. */ - SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0; - regnode_ssc accum; - regnode * const startbranch=scan; - - if (flags & SCF_DO_SUBSTR) { - /* Cannot merge strings after this. */ - scan_commit(pRExC_state, data, minlenp, is_inf); - } - - if (flags & SCF_DO_STCLASS) - ssc_init_zero(pRExC_state, &accum); - - while (OP(scan) == code) { - SSize_t deltanext, minnext, fake_last_close = 0; - regnode *fake_last_close_op = NULL; - U32 f = (flags & SCF_TRIE_DOING_RESTUDY); - regnode_ssc this_class; - - DEBUG_PEEP("Branch", scan, depth, flags); - - num++; - StructCopy(&zero_scan_data, &data_fake, scan_data_t); - if (data) { - data_fake.whilem_c = data->whilem_c; - data_fake.last_closep = data->last_closep; - data_fake.last_close_opp = data->last_close_opp; - } - else { - data_fake.last_closep = &fake_last_close; - data_fake.last_close_opp = &fake_last_close_op; - } - - data_fake.pos_delta = delta; - next = regnext(scan); - - scan = REGNODE_AFTER_opcode(scan, code); - - if (flags & SCF_DO_STCLASS) { - ssc_init(pRExC_state, &this_class); - data_fake.start_class = &this_class; - f |= SCF_DO_STCLASS_AND; - } - if (flags & SCF_WHILEM_VISITED_POS) - f |= SCF_WHILEM_VISITED_POS; - - /* we suppose the run is continuous, last=next...*/ - /* recurse study_chunk() for each BRANCH in an alternation */ - minnext = study_chunk(pRExC_state, &scan, minlenp, - &deltanext, next, &data_fake, stopparen, - recursed_depth, NULL, f, depth+1, - mutate_ok); - - if (min1 > minnext) - min1 = minnext; - if (deltanext == OPTIMIZE_INFTY) { - is_inf = is_inf_internal = 1; - max1 = OPTIMIZE_INFTY; - } else if (max1 < minnext + deltanext) - max1 = minnext + deltanext; - scan = next; - if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) - pars++; - if (data_fake.flags & SCF_SEEN_ACCEPT) { - if ( stopmin > minnext) - stopmin = min + min1; - flags &= ~SCF_DO_SUBSTR; - if (data) - data->flags |= SCF_SEEN_ACCEPT; - } - if (data) { - if (data_fake.flags & SF_HAS_EVAL) - data->flags |= SF_HAS_EVAL; - data->whilem_c = data_fake.whilem_c; - } - if (flags & SCF_DO_STCLASS) - ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); - DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta); - } - if (code == IFTHEN && num < 2) /* Empty ELSE branch */ - min1 = 0; - if (flags & SCF_DO_SUBSTR) { - data->pos_min += min1; - if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1)) - data->pos_delta = OPTIMIZE_INFTY; - else - data->pos_delta += max1 - min1; - if (max1 != min1 || is_inf) - data->cur_is_floating = 1; - } - min += min1; - if (delta == OPTIMIZE_INFTY - || OPTIMIZE_INFTY - delta - (max1 - min1) < 0) - delta = OPTIMIZE_INFTY; - else - delta += max1 - min1; - if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); - if (min1) { - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - flags &= ~SCF_DO_STCLASS; - } - } - else if (flags & SCF_DO_STCLASS_AND) { - if (min1) { - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); - flags &= ~SCF_DO_STCLASS; - } - else { - /* Switch to OR mode: cache the old value of - * data->start_class */ - INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, regnode_ssc); - flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, regnode_ssc); - flags |= SCF_DO_STCLASS_OR; - } - } - DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta); - - if (PERL_ENABLE_TRIE_OPTIMISATION - && OP(startbranch) == BRANCH - && mutate_ok - ) { - /* demq. - - Assuming this was/is a branch we are dealing with: 'scan' - now points at the item that follows the branch sequence, - whatever it is. We now start at the beginning of the - sequence and look for subsequences of - - BRANCH->EXACT=>x1 - BRANCH->EXACT=>x2 - tail - - which would be constructed from a pattern like - /A|LIST|OF|WORDS/ - - If we can find such a subsequence we need to turn the first - element into a trie and then add the subsequent branch exact - strings to the trie. - - We have two cases - - 1. patterns where the whole set of branches can be - converted. - - 2. patterns where only a subset can be converted. - - In case 1 we can replace the whole set with a single regop - for the trie. In case 2 we need to keep the start and end - branches so - - 'BRANCH EXACT; BRANCH EXACT; BRANCH X' - becomes BRANCH TRIE; BRANCH X; - - There is an additional case, that being where there is a - common prefix, which gets split out into an EXACT like node - preceding the TRIE node. - - If X(1..n)==tail then we can do a simple trie, if not we make - a "jump" trie, such that when we match the appropriate word - we "jump" to the appropriate tail node. Essentially we turn - a nested if into a case structure of sorts. - - */ - - int made=0; - if (!re_trie_maxbuff) { - re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); - if (!SvIOK(re_trie_maxbuff)) - sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); - } - if ( SvIV(re_trie_maxbuff)>=0 ) { - regnode *cur; - regnode *first = (regnode *)NULL; - regnode *prev = (regnode *)NULL; - regnode *tail = scan; - U8 trietype = 0; - U32 count=0; - - /* var tail is used because there may be a TAIL - regop in the way. Ie, the exacts will point to the - thing following the TAIL, but the last branch will - point at the TAIL. So we advance tail. If we - have nested (?:) we may have to move through several - tails. - */ - - while ( OP( tail ) == TAIL ) { - /* this is the TAIL generated by (?:) */ - tail = regnext( tail ); - } - - - DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state); - Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n", - depth+1, - "Looking for TRIE'able sequences. Tail node is ", - (UV) REGNODE_OFFSET(tail), - SvPV_nolen_const( RExC_mysv ) - ); - }); - - /* - - Step through the branches - cur represents each branch, - noper is the first thing to be matched as part - of that branch - noper_next is the regnext() of that node. - - We normally handle a case like this - /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also - support building with NOJUMPTRIE, which restricts - the trie logic to structures like /FOO|BAR/. - - If noper is a trieable nodetype then the branch is - a possible optimization target. If we are building - under NOJUMPTRIE then we require that noper_next is - the same as scan (our current position in the regex - program). - - Once we have two or more consecutive such branches - we can create a trie of the EXACT's contents and - stitch it in place into the program. - - If the sequence represents all of the branches in - the alternation we replace the entire thing with a - single TRIE node. - - Otherwise when it is a subsequence we need to - stitch it in place and replace only the relevant - branches. This means the first branch has to remain - as it is used by the alternation logic, and its - next pointer, and needs to be repointed at the item - on the branch chain following the last branch we - have optimized away. - - This could be either a BRANCH, in which case the - subsequence is internal, or it could be the item - following the branch sequence in which case the - subsequence is at the end (which does not - necessarily mean the first node is the start of the - alternation). - - TRIE_TYPE(X) is a define which maps the optype to a - trietype. - - optype | trietype - ----------------+----------- - NOTHING | NOTHING - EXACT | EXACT - EXACT_REQ8 | EXACT - EXACTFU | EXACTFU - EXACTFU_REQ8 | EXACTFU - EXACTFUP | EXACTFU - EXACTFAA | EXACTFAA - EXACTL | EXACTL - EXACTFLU8 | EXACTFLU8 - - - */ -#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \ - ? NOTHING \ - : ( EXACT == (X) || EXACT_REQ8 == (X) ) \ - ? EXACT \ - : ( EXACTFU == (X) \ - || EXACTFU_REQ8 == (X) \ - || EXACTFUP == (X) ) \ - ? EXACTFU \ - : ( EXACTFAA == (X) ) \ - ? EXACTFAA \ - : ( EXACTL == (X) ) \ - ? EXACTL \ - : ( EXACTFLU8 == (X) ) \ - ? EXACTFLU8 \ - : 0 ) - - /* dont use tail as the end marker for this traverse */ - for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { - regnode * const noper = REGNODE_AFTER( cur ); - U8 noper_type = OP( noper ); - U8 noper_trietype = TRIE_TYPE( noper_type ); -#if defined(DEBUGGING) || defined(NOJUMPTRIE) - regnode * const noper_next = regnext( noper ); - U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; - U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0; -#endif - - DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); - Perl_re_indentf( aTHX_ "- %d:%s (%d)", - depth+1, - REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) ); - - regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state); - Perl_re_printf( aTHX_ " -> %d:%s", - REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv)); - - if ( noper_next ) { - regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state); - Perl_re_printf( aTHX_ "\t=> %d:%s\t", - REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv)); - } - Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n", - REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur), - REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype) - ); - }); - - /* Is noper a trieable nodetype that can be merged - * with the current trie (if there is one)? */ - if ( noper_trietype - && - ( - ( noper_trietype == NOTHING ) - || ( trietype == NOTHING ) - || ( trietype == noper_trietype ) - ) -#ifdef NOJUMPTRIE - && noper_next >= tail -#endif - && count < U16_MAX) - { - /* Handle mergable triable node Either we are - * the first node in a new trieable sequence, - * in which case we do some bookkeeping, - * otherwise we update the end pointer. */ - if ( !first ) { - first = cur; - if ( noper_trietype == NOTHING ) { -#if !defined(DEBUGGING) && !defined(NOJUMPTRIE) - regnode * const noper_next = regnext( noper ); - U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; - U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; -#endif - - if ( noper_next_trietype ) { - trietype = noper_next_trietype; - } else if (noper_next_type) { - /* a NOTHING regop is 1 regop wide. - * We need at least two for a trie - * so we can't merge this in */ - first = NULL; - } - } else { - trietype = noper_trietype; - } - } else { - if ( trietype == NOTHING ) - trietype = noper_trietype; - prev = cur; - } - if (first) - count++; - } /* end handle mergable triable node */ - else { - /* handle unmergable node - - * noper may either be a triable node which can - * not be tried together with the current trie, - * or a non triable node */ - if ( prev ) { - /* If last is set and trietype is not - * NOTHING then we have found at least two - * triable branch sequences in a row of a - * similar trietype so we can turn them - * into a trie. If/when we allow NOTHING to - * start a trie sequence this condition - * will be required, and it isn't expensive - * so we leave it in for now. */ - if ( trietype && trietype != NOTHING ) - make_trie( pRExC_state, - startbranch, first, cur, tail, - count, trietype, depth+1 ); - prev = NULL; /* note: we clear/update - first, trietype etc below, - so we dont do it here */ - } - if ( noper_trietype -#ifdef NOJUMPTRIE - && noper_next >= tail -#endif - ){ - /* noper is triable, so we can start a new - * trie sequence */ - count = 1; - first = cur; - trietype = noper_trietype; - } else if (first) { - /* if we already saw a first but the - * current node is not triable then we have - * to reset the first information. */ - count = 0; - first = NULL; - trietype = 0; - } - } /* end handle unmergable node */ - } /* loop over branches */ - DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); - Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ", - depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur)); - Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n", - REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur), - REGNODE_NAME(trietype) - ); - - }); - if ( prev && trietype ) { - if ( trietype != NOTHING ) { - /* the last branch of the sequence was part of - * a trie, so we have to construct it here - * outside of the loop */ - made= make_trie( pRExC_state, startbranch, - first, scan, tail, count, - trietype, depth+1 ); -#ifdef TRIE_STUDY_OPT - if ( ((made == MADE_EXACT_TRIE && - startbranch == first) - || ( first_non_open == first )) && - depth==0 ) { - flags |= SCF_TRIE_RESTUDY; - if ( startbranch == first - && scan >= tail ) - { - RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; - } - } -#endif - } else { - /* at this point we know whatever we have is a - * NOTHING sequence/branch AND if 'startbranch' - * is 'first' then we can turn the whole thing - * into a NOTHING - */ - if ( startbranch == first ) { - regnode *opt; - /* the entire thing is a NOTHING sequence, - * something like this: (?:|) So we can - * turn it into a plain NOTHING op. */ - DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); - Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n", - depth+1, - SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur)); - - }); - OP(startbranch)= NOTHING; - NEXT_OFF(startbranch)= tail - startbranch; - for ( opt= startbranch + 1; opt < tail ; opt++ ) - OP(opt)= OPTIMIZED; - } - } - } /* end if ( prev) */ - } /* TRIE_MAXBUF is non zero */ - } /* do trie */ - DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta); - } - else - scan = REGNODE_AFTER_opcode(scan,code); - continue; - } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) { - I32 paren = 0; - regnode *start = NULL; - regnode *end = NULL; - U32 my_recursed_depth= recursed_depth; - - if (OP(scan) != SUSPEND) { /* GOSUB */ - /* Do setup, note this code has side effects beyond - * the rest of this block. Specifically setting - * RExC_recurse[] must happen at least once during - * study_chunk(). */ - paren = ARG(scan); - RExC_recurse[ARG2L(scan)] = scan; - start = REGNODE_p(RExC_open_parens[paren]); - end = REGNODE_p(RExC_close_parens[paren]); - - /* NOTE we MUST always execute the above code, even - * if we do nothing with a GOSUB */ - if ( - ( flags & SCF_IN_DEFINE ) - || - ( - (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF)) - && - ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 ) - ) - ) { - /* no need to do anything here if we are in a define. */ - /* or we are after some kind of infinite construct - * so we can skip recursing into this item. - * Since it is infinite we will not change the maxlen - * or delta, and if we miss something that might raise - * the minlen it will merely pessimise a little. - * - * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/ - * might result in a minlen of 1 and not of 4, - * but this doesn't make us mismatch, just try a bit - * harder than we should. - * - * However we must assume this GOSUB is infinite, to - * avoid wrongly applying other optimizations in the - * enclosing scope - see GH 18096, for example. - */ - is_inf = is_inf_internal = 1; - scan= regnext(scan); - continue; - } - - if ( - !recursed_depth - || !PAREN_TEST(recursed_depth - 1, paren) - ) { - /* it is quite possible that there are more efficient ways - * to do this. We maintain a bitmap per level of recursion - * of which patterns we have entered so we can detect if a - * pattern creates a possible infinite loop. When we - * recurse down a level we copy the previous levels bitmap - * down. When we are at recursion level 0 we zero the top - * level bitmap. It would be nice to implement a different - * more efficient way of doing this. In particular the top - * level bitmap may be unnecessary. - */ - if (!recursed_depth) { - Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); - } else { - Copy(PAREN_OFFSET(recursed_depth - 1), - PAREN_OFFSET(recursed_depth), - RExC_study_chunk_recursed_bytes, U8); - } - /* we havent recursed into this paren yet, so recurse into it */ - DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta); - PAREN_SET(recursed_depth, paren); - my_recursed_depth= recursed_depth + 1; - } else { - DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta); - /* some form of infinite recursion, assume infinite length - * */ - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data, minlenp, is_inf); - data->cur_is_floating = 1; - } - is_inf = is_inf_internal = 1; - if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - ssc_anything(data->start_class); - flags &= ~SCF_DO_STCLASS; - - start= NULL; /* reset start so we dont recurse later on. */ - } - } else { - paren = stopparen; - start = scan + 2; - end = regnext(scan); - } - if (start) { - scan_frame *newframe; - assert(end); - if (!RExC_frame_last) { - Newxz(newframe, 1, scan_frame); - SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe); - RExC_frame_head= newframe; - RExC_frame_count++; - } else if (!RExC_frame_last->next_frame) { - Newxz(newframe, 1, scan_frame); - RExC_frame_last->next_frame= newframe; - newframe->prev_frame= RExC_frame_last; - RExC_frame_count++; - } else { - newframe= RExC_frame_last->next_frame; - } - RExC_frame_last= newframe; - - newframe->next_regnode = regnext(scan); - newframe->last_regnode = last; - newframe->stopparen = stopparen; - newframe->prev_recursed_depth = recursed_depth; - newframe->this_prev_frame= frame; - newframe->in_gosub = ( - (frame && frame->in_gosub) || OP(scan) == GOSUB - ); - - DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta); - DEBUG_PEEP("fnew", scan, depth, flags); - - frame = newframe; - scan = start; - stopparen = paren; - last = end; - depth = depth + 1; - recursed_depth= my_recursed_depth; - - continue; - } - } - else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) { - SSize_t bytelen = STR_LEN(scan), charlen; - UV uc; - assert(bytelen); - if (UTF) { - const U8 * const s = (U8*)STRING(scan); - uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); - charlen = utf8_length(s, s + bytelen); - } else { - uc = *((U8*)STRING(scan)); - charlen = bytelen; - } - min += charlen; - if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ - /* The code below prefers earlier match for fixed - offset, later match for variable offset. */ - if (data->last_end == -1) { /* Update the start info. */ - data->last_start_min = data->pos_min; - data->last_start_max = - is_inf ? OPTIMIZE_INFTY - : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min) - ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta; - } - sv_catpvn(data->last_found, STRING(scan), bytelen); - if (UTF) - SvUTF8_on(data->last_found); - { - SV * const sv = data->last_found; - MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? - mg_find(sv, PERL_MAGIC_utf8) : NULL; - if (mg && mg->mg_len >= 0) - mg->mg_len += charlen; - } - data->last_end = data->pos_min + charlen; - data->pos_min += charlen; /* As in the first entry. */ - data->flags &= ~SF_BEFORE_EOL; - } - - /* ANDing the code point leaves at most it, and not in locale, and - * can't match null string */ - if (flags & SCF_DO_STCLASS_AND) { - ssc_cp_and(data->start_class, uc); - ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; - ssc_clear_locale(data->start_class); - } - else if (flags & SCF_DO_STCLASS_OR) { - ssc_add_cp(data->start_class, uc); - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - - /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; - } - flags &= ~SCF_DO_STCLASS; - DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta); - } - else if (REGNODE_TYPE(OP(scan)) == EXACT) { - /* But OP != EXACT!, so is EXACTFish */ - SSize_t bytelen = STR_LEN(scan), charlen; - const U8 * s = (U8*)STRING(scan); - - /* Replace a length 1 ASCII fold pair node with an ANYOFM node, - * with the mask set to the complement of the bit that differs - * between upper and lower case, and the lowest code point of the - * pair (which the '&' forces) */ - if ( bytelen == 1 - && isALPHA_A(*s) - && ( OP(scan) == EXACTFAA - || ( OP(scan) == EXACTFU - && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s))) - && mutate_ok - ) { - U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */ - - OP(scan) = ANYOFM; - ARG_SET(scan, *s & mask); - FLAGS(scan) = mask; - /* We're not EXACTFish any more, so restudy. - * Search for "restudy" in this file to find - * a comment with details. */ - continue; - } - - /* Search for fixed substrings supports EXACT only. */ - if (flags & SCF_DO_SUBSTR) { - assert(data); - scan_commit(pRExC_state, data, minlenp, is_inf); - } - charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen; - if (unfolded_multi_char) { - RExC_seen |= REG_UNFOLDED_MULTI_SEEN; - } - min += charlen - min_subtract; - assert (min >= 0); - if ((SSize_t)min_subtract < OPTIMIZE_INFTY - && delta < OPTIMIZE_INFTY - (SSize_t)min_subtract - ) { - delta += min_subtract; - } else { - delta = OPTIMIZE_INFTY; - } - if (flags & SCF_DO_SUBSTR) { - data->pos_min += charlen - min_subtract; - if (data->pos_min < 0) { - data->pos_min = 0; - } - if ((SSize_t)min_subtract < OPTIMIZE_INFTY - && data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract - ) { - data->pos_delta += min_subtract; - } else { - data->pos_delta = OPTIMIZE_INFTY; - } - if (min_subtract) { - data->cur_is_floating = 1; /* float */ - } - } - - if (flags & SCF_DO_STCLASS) { - SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan); - - assert(EXACTF_invlist); - if (flags & SCF_DO_STCLASS_AND) { - if (OP(scan) != EXACTFL) - ssc_clear_locale(data->start_class); - ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; - ANYOF_POSIXL_ZERO(data->start_class); - ssc_intersection(data->start_class, EXACTF_invlist, FALSE); - } - else { /* SCF_DO_STCLASS_OR */ - ssc_union(data->start_class, EXACTF_invlist, FALSE); - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - - /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; - } - flags &= ~SCF_DO_STCLASS; - SvREFCNT_dec(EXACTF_invlist); - } - DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta); - } - else if (REGNODE_VARIES(OP(scan))) { - SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; - I32 fl = 0; - U32 f = flags; - regnode * const oscan = scan; - regnode_ssc this_class; - regnode_ssc *oclass = NULL; - I32 next_is_eval = 0; - - switch (REGNODE_TYPE(OP(scan))) { - case WHILEM: /* End of (?:...)* . */ - scan = REGNODE_AFTER(scan); - goto finish; - case PLUS: - if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { - next = REGNODE_AFTER(scan); - if ( ( REGNODE_TYPE(OP(next)) == EXACT - && ! isEXACTFish(OP(next))) - || (flags & SCF_DO_STCLASS)) - { - mincount = 1; - maxcount = REG_INFTY; - next = regnext(scan); - scan = REGNODE_AFTER(scan); - goto do_curly; - } - } - if (flags & SCF_DO_SUBSTR) - data->pos_min++; - /* This will bypass the formal 'min += minnext * mincount' - * calculation in the do_curly path, so assumes min width - * of the PLUS payload is exactly one. */ - min++; - /* FALLTHROUGH */ - case STAR: - next = REGNODE_AFTER(scan); - - /* This temporary node can now be turned into EXACTFU, and - * must, as regexec.c doesn't handle it */ - if (OP(next) == EXACTFU_S_EDGE && mutate_ok) { - OP(next) = EXACTFU; - } - - if ( STR_LEN(next) == 1 - && isALPHA_A(* STRING(next)) - && ( OP(next) == EXACTFAA - || ( OP(next) == EXACTFU - && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))) - && mutate_ok - ) { - /* These differ in just one bit */ - U8 mask = ~ ('A' ^ 'a'); - - assert(isALPHA_A(* STRING(next))); - - /* Then replace it by an ANYOFM node, with - * the mask set to the complement of the - * bit that differs between upper and lower - * case, and the lowest code point of the - * pair (which the '&' forces) */ - OP(next) = ANYOFM; - ARG_SET(next, *STRING(next) & mask); - FLAGS(next) = mask; - } - - if (flags & SCF_DO_STCLASS) { - mincount = 0; - maxcount = REG_INFTY; - next = regnext(scan); - scan = REGNODE_AFTER(scan); - goto do_curly; - } - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data, minlenp, is_inf); - /* Cannot extend fixed substrings */ - data->cur_is_floating = 1; /* float */ - } - is_inf = is_inf_internal = 1; - scan = regnext(scan); - goto optimize_curly_tail; - case CURLY: - if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) - && (scan->flags == stopparen)) - { - mincount = 1; - maxcount = 1; - } else { - mincount = ARG1(scan); - maxcount = ARG2(scan); - } - next = regnext(scan); - if (OP(scan) == CURLYX) { - I32 lp = (data ? *(data->last_closep) : 0); - scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); - } - scan = REGNODE_AFTER(scan); - next_is_eval = (OP(scan) == EVAL); - do_curly: - if (flags & SCF_DO_SUBSTR) { - if (mincount == 0) - scan_commit(pRExC_state, data, minlenp, is_inf); - /* Cannot extend fixed substrings */ - pos_before = data->pos_min; - } - if (data) { - fl = data->flags; - data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); - if (is_inf) - data->flags |= SF_IS_INF; - } - if (flags & SCF_DO_STCLASS) { - ssc_init(pRExC_state, &this_class); - oclass = data->start_class; - data->start_class = &this_class; - f |= SCF_DO_STCLASS_AND; - f &= ~SCF_DO_STCLASS_OR; - } - /* Exclude from super-linear cache processing any {n,m} - regops for which the combination of input pos and regex - pos is not enough information to determine if a match - will be possible. - - For example, in the regex /foo(bar\s*){4,8}baz/ with the - regex pos at the \s*, the prospects for a match depend not - only on the input position but also on how many (bar\s*) - repeats into the {4,8} we are. */ - if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY)) - f &= ~SCF_WHILEM_VISITED_POS; - - /* This will finish on WHILEM, setting scan, or on NULL: */ - /* recurse study_chunk() on loop bodies */ - minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, - last, data, stopparen, recursed_depth, NULL, - (mincount == 0 - ? (f & ~SCF_DO_SUBSTR) - : f) - , depth+1, mutate_ok); - - if (data && data->flags & SCF_SEEN_ACCEPT) { - if (mincount > 1) - mincount = 1; - } - - if (flags & SCF_DO_STCLASS) - data->start_class = oclass; - if (mincount == 0 || minnext == 0) { - if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); - } - else if (flags & SCF_DO_STCLASS_AND) { - /* Switch to OR mode: cache the old value of - * data->start_class */ - INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, regnode_ssc); - flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&this_class, data->start_class, regnode_ssc); - flags |= SCF_DO_STCLASS_OR; - ANYOF_FLAGS(data->start_class) - |= SSC_MATCHES_EMPTY_STRING; - } - } else { /* Non-zero len */ - if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - } - else if (flags & SCF_DO_STCLASS_AND) - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); - flags &= ~SCF_DO_STCLASS; - } - if (!scan) /* It was not CURLYX, but CURLY. */ - scan = next; - if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR) - /* ? quantifier ok, except for (?{ ... }) */ - && (next_is_eval || !(mincount == 0 && maxcount == 1)) - && (minnext == 0) && (deltanext == 0) - && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) - && maxcount <= REG_INFTY/3) /* Complement check for big - count */ - { - _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP), - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), - "Quantifier unexpected on zero-length expression " - "in regex m/%" UTF8f "/", - UTF8fARG(UTF, RExC_precomp_end - RExC_precomp, - RExC_precomp))); - } - - if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext ) - || min >= SSize_t_MAX - minnext * mincount ) - { - FAIL("Regexp out of space"); - } - - min += minnext * mincount; - is_inf_internal |= deltanext == OPTIMIZE_INFTY - || (maxcount == REG_INFTY && minnext + deltanext > 0); - is_inf |= is_inf_internal; - if (is_inf) { - delta = OPTIMIZE_INFTY; - } else { - delta += (minnext + deltanext) * maxcount - - minnext * mincount; - } - - if (data && data->flags & SCF_SEEN_ACCEPT) { - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data, minlenp, is_inf); - flags &= ~SCF_DO_SUBSTR; - } - if (stopmin > min) - stopmin = min; - DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta); - } - /* Try powerful optimization CURLYX => CURLYN. */ - if ( OP(oscan) == CURLYX && data - && data->flags & SF_IN_PAR - && !(data->flags & SF_HAS_EVAL) - && !deltanext && minnext == 1 - && mutate_ok - ) { - /* Try to optimize to CURLYN. */ - regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); - regnode * const nxt1 = nxt; -#ifdef DEBUGGING - regnode *nxt2; -#endif - - /* Skip open. */ - nxt = regnext(nxt); - if (!REGNODE_SIMPLE(OP(nxt)) - && !(REGNODE_TYPE(OP(nxt)) == EXACT - && STR_LEN(nxt) == 1)) - goto nogo; -#ifdef DEBUGGING - nxt2 = nxt; -#endif - nxt = regnext(nxt); - if (OP(nxt) != CLOSE) - goto nogo; - if (RExC_open_parens) { - - /*open->CURLYM*/ - RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan); - - /*close->while*/ - RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt) + 2; - } - /* Now we know that nxt2 is the only contents: */ - oscan->flags = (U8)PARNO(nxt); - OP(oscan) = CURLYN; - OP(nxt1) = NOTHING; /* was OPEN. */ - -#ifdef DEBUGGING - OP(nxt1 + 1) = OPTIMIZED; /* was count. */ - NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ - NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ - OP(nxt) = OPTIMIZED; /* was CLOSE. */ - OP(nxt + 1) = OPTIMIZED; /* was count. */ - NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ -#endif - } - nogo: - - /* Try optimization CURLYX => CURLYM. */ - if ( OP(oscan) == CURLYX && data - && !(data->flags & SF_HAS_PAR) - && !(data->flags & SF_HAS_EVAL) - && !deltanext /* atom is fixed width */ - && minnext != 0 /* CURLYM can't handle zero width */ - /* Nor characters whose fold at run-time may be - * multi-character */ - && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) - && mutate_ok - ) { - /* XXXX How to optimize if data == 0? */ - /* Optimize to a simpler form. */ - regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); /* OPEN */ - regnode *nxt2; - - OP(oscan) = CURLYM; - while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ - && (OP(nxt2) != WHILEM)) - nxt = nxt2; - OP(nxt2) = SUCCEED; /* Whas WHILEM */ - /* Need to optimize away parenths. */ - if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { - /* Set the parenth number. */ - /* note that we have changed the type of oscan to CURLYM here */ - regnode *nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); /* OPEN*/ - - oscan->flags = (U8)PARNO(nxt); - if (RExC_open_parens) { - /*open->CURLYM*/ - RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan); - - /*close->NOTHING*/ - RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt2) - + 1; - } - OP(nxt1) = OPTIMIZED; /* was OPEN. */ - OP(nxt) = OPTIMIZED; /* was CLOSE. */ - -#ifdef DEBUGGING - OP(nxt1 + 1) = OPTIMIZED; /* was count. */ - OP(nxt + 1) = OPTIMIZED; /* was count. */ - NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */ - NEXT_OFF(nxt + 1) = 0; /* just for consistency. */ -#endif -#if 0 - while ( nxt1 && (OP(nxt1) != WHILEM)) { - regnode *nnxt = regnext(nxt1); - if (nnxt == nxt) { - if (REGNODE_OFF_BY_ARG(OP(nxt1))) - ARG_SET(nxt1, nxt2 - nxt1); - else if (nxt2 - nxt1 < U16_MAX) - NEXT_OFF(nxt1) = nxt2 - nxt1; - else - OP(nxt) = NOTHING; /* Cannot beautify */ - } - nxt1 = nnxt; - } -#endif - /* Optimize again: */ - /* recurse study_chunk() on optimised CURLYX => CURLYM */ - study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, - NULL, stopparen, recursed_depth, NULL, 0, - depth+1, mutate_ok); - } - else - oscan->flags = 0; - } - else if ((OP(oscan) == CURLYX) - && (flags & SCF_WHILEM_VISITED_POS) - /* See the comment on a similar expression above. - However, this time it's not a subexpression - we care about, but the expression itself. */ - && (maxcount == REG_INFTY) - && data) { - /* This stays as CURLYX, we can put the count/of pair. */ - /* Find WHILEM (as in regexec.c) */ - regnode *nxt = oscan + NEXT_OFF(oscan); - - if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */ - nxt += ARG(nxt); - nxt = REGNODE_BEFORE(nxt); - if (nxt->flags & 0xf) { - /* we've already set whilem count on this node */ - } else if (++data->whilem_c < 16) { - assert(data->whilem_c <= RExC_whilem_seen); - nxt->flags = (U8)(data->whilem_c - | (RExC_whilem_seen << 4)); /* On WHILEM */ - } - } - if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) - pars++; - if (flags & SCF_DO_SUBSTR) { - SV *last_str = NULL; - STRLEN last_chrs = 0; - int counted = mincount != 0; - - if (data->last_end > 0 && mincount != 0) { /* Ends with a - string. */ - SSize_t b = pos_before >= data->last_start_min - ? pos_before : data->last_start_min; - STRLEN l; - const char * const s = SvPV_const(data->last_found, l); - SSize_t old = b - data->last_start_min; - assert(old >= 0); - - if (UTF) - old = utf8_hop_forward((U8*)s, old, - (U8 *) SvEND(data->last_found)) - - (U8*)s; - l -= old; - /* Get the added string: */ - last_str = newSVpvn_utf8(s + old, l, UTF); - last_chrs = UTF ? utf8_length((U8*)(s + old), - (U8*)(s + old + l)) : l; - if (deltanext == 0 && pos_before == b) { - /* What was added is a constant string */ - if (mincount > 1) { - - SvGROW(last_str, (mincount * l) + 1); - repeatcpy(SvPVX(last_str) + l, - SvPVX_const(last_str), l, - mincount - 1); - SvCUR_set(last_str, SvCUR(last_str) * mincount); - /* Add additional parts. */ - SvCUR_set(data->last_found, - SvCUR(data->last_found) - l); - sv_catsv(data->last_found, last_str); - { - SV * sv = data->last_found; - MAGIC *mg = - SvUTF8(sv) && SvMAGICAL(sv) ? - mg_find(sv, PERL_MAGIC_utf8) : NULL; - if (mg && mg->mg_len >= 0) - mg->mg_len += last_chrs * (mincount-1); - } - last_chrs *= mincount; - data->last_end += l * (mincount - 1); - } - } else { - /* start offset must point into the last copy */ - data->last_start_min += minnext * (mincount - 1); - data->last_start_max = - is_inf - ? OPTIMIZE_INFTY - : data->last_start_max + - (maxcount - 1) * (minnext + data->pos_delta); - } - } - /* It is counted once already... */ - data->pos_min += minnext * (mincount - counted); -#if 0 - Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf - " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf - " maxcount=%" UVuf " mincount=%" UVuf - " data->pos_delta=%" UVuf "\n", - (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, - (UV)maxcount, (UV)mincount, (UV)data->pos_delta); - if (deltanext != OPTIMIZE_INFTY) - Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", - (UV)(-counted * deltanext + (minnext + deltanext) * maxcount - - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta)); -#endif - if (deltanext == OPTIMIZE_INFTY - || data->pos_delta == OPTIMIZE_INFTY - || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta) - data->pos_delta = OPTIMIZE_INFTY; - else - data->pos_delta += - counted * deltanext + - (minnext + deltanext) * maxcount - minnext * mincount; - if (mincount != maxcount) { - /* Cannot extend fixed substrings found inside - the group. */ - scan_commit(pRExC_state, data, minlenp, is_inf); - if (mincount && last_str) { - SV * const sv = data->last_found; - MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? - mg_find(sv, PERL_MAGIC_utf8) : NULL; - - if (mg) - mg->mg_len = -1; - sv_setsv(sv, last_str); - data->last_end = data->pos_min; - data->last_start_min = data->pos_min - last_chrs; - data->last_start_max = is_inf - ? OPTIMIZE_INFTY - : data->pos_min + data->pos_delta - last_chrs; - } - data->cur_is_floating = 1; /* float */ - } - SvREFCNT_dec(last_str); - } - if (data && (fl & SF_HAS_EVAL)) - data->flags |= SF_HAS_EVAL; - optimize_curly_tail: - rck_elide_nothing(oscan); - continue; - - default: - Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", - OP(scan)); - case REF: - case CLUMP: - if (flags & SCF_DO_SUBSTR) { - /* Cannot expect anything... */ - scan_commit(pRExC_state, data, minlenp, is_inf); - data->cur_is_floating = 1; /* float */ - } - is_inf = is_inf_internal = 1; - if (flags & SCF_DO_STCLASS_OR) { - if (OP(scan) == CLUMP) { - /* Actually is any start char, but very few code points - * aren't start characters */ - ssc_match_all_cp(data->start_class); - } - else { - ssc_anything(data->start_class); - } - } - flags &= ~SCF_DO_STCLASS; - break; - } - } - else if (OP(scan) == LNBREAK) { - if (flags & SCF_DO_STCLASS) { - if (flags & SCF_DO_STCLASS_AND) { - ssc_intersection(data->start_class, - PL_XPosix_ptrs[CC_VERTSPACE_], FALSE); - ssc_clear_locale(data->start_class); - ANYOF_FLAGS(data->start_class) - &= ~SSC_MATCHES_EMPTY_STRING; - } - else if (flags & SCF_DO_STCLASS_OR) { - ssc_union(data->start_class, - PL_XPosix_ptrs[CC_VERTSPACE_], - FALSE); - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - - /* See commit msg for - * 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) - &= ~SSC_MATCHES_EMPTY_STRING; - } - flags &= ~SCF_DO_STCLASS; - } - min++; - if (delta != OPTIMIZE_INFTY) - delta++; /* Because of the 2 char string cr-lf */ - if (flags & SCF_DO_SUBSTR) { - /* Cannot expect anything... */ - scan_commit(pRExC_state, data, minlenp, is_inf); - data->pos_min += 1; - if (data->pos_delta != OPTIMIZE_INFTY) { - data->pos_delta += 1; - } - data->cur_is_floating = 1; /* float */ - } - } - else if (REGNODE_SIMPLE(OP(scan))) { - - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data, minlenp, is_inf); - data->pos_min++; - } - min++; - if (flags & SCF_DO_STCLASS) { - bool invert = 0; - SV* my_invlist = NULL; - U8 namedclass; - - /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; - - /* Some of the logic below assumes that switching - locale on will only add false positives. */ - switch (OP(scan)) { - - default: -#ifdef DEBUGGING - Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", - OP(scan)); -#endif - case SANY: - if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - ssc_match_all_cp(data->start_class); - break; - - case REG_ANY: - { - SV* REG_ANY_invlist = _new_invlist(2); - REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, - '\n'); - if (flags & SCF_DO_STCLASS_OR) { - ssc_union(data->start_class, - REG_ANY_invlist, - TRUE /* TRUE => invert, hence all but \n - */ - ); - } - else if (flags & SCF_DO_STCLASS_AND) { - ssc_intersection(data->start_class, - REG_ANY_invlist, - TRUE /* TRUE => invert */ - ); - ssc_clear_locale(data->start_class); - } - SvREFCNT_dec_NN(REG_ANY_invlist); - } - break; - - case ANYOFD: - case ANYOFL: - case ANYOFPOSIXL: - case ANYOFH: - case ANYOFHb: - case ANYOFHr: - case ANYOFHs: - case ANYOF: - if (flags & SCF_DO_STCLASS_AND) - ssc_and(pRExC_state, data->start_class, - (regnode_charclass *) scan); - else - ssc_or(pRExC_state, data->start_class, - (regnode_charclass *) scan); - break; - - case ANYOFHbbm: - { - SV* cp_list = get_ANYOFHbbm_contents(scan); - - if (flags & SCF_DO_STCLASS_OR) { - ssc_union(data->start_class, cp_list, invert); - } - else if (flags & SCF_DO_STCLASS_AND) { - ssc_intersection(data->start_class, cp_list, invert); - } - - SvREFCNT_dec_NN(cp_list); - break; - } - - case NANYOFM: /* NANYOFM already contains the inversion of the - input ANYOF data, so, unlike things like - NPOSIXA, don't change 'invert' to TRUE */ - /* FALLTHROUGH */ - case ANYOFM: - { - SV* cp_list = get_ANYOFM_contents(scan); - - if (flags & SCF_DO_STCLASS_OR) { - ssc_union(data->start_class, cp_list, invert); - } - else if (flags & SCF_DO_STCLASS_AND) { - ssc_intersection(data->start_class, cp_list, invert); - } - - SvREFCNT_dec_NN(cp_list); - break; - } - - case ANYOFR: - case ANYOFRb: - { - SV* cp_list = NULL; - - cp_list = _add_range_to_invlist(cp_list, - ANYOFRbase(scan), - ANYOFRbase(scan) + ANYOFRdelta(scan)); - - if (flags & SCF_DO_STCLASS_OR) { - ssc_union(data->start_class, cp_list, invert); - } - else if (flags & SCF_DO_STCLASS_AND) { - ssc_intersection(data->start_class, cp_list, invert); - } - - SvREFCNT_dec_NN(cp_list); - break; - } - - case NPOSIXL: - invert = 1; - /* FALLTHROUGH */ - - case POSIXL: - namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; - if (flags & SCF_DO_STCLASS_AND) { - bool was_there = cBOOL( - ANYOF_POSIXL_TEST(data->start_class, - namedclass)); - ANYOF_POSIXL_ZERO(data->start_class); - if (was_there) { /* Do an AND */ - ANYOF_POSIXL_SET(data->start_class, namedclass); - } - /* No individual code points can now match */ - data->start_class->invlist - = sv_2mortal(_new_invlist(0)); - } - else { - int complement = namedclass + ((invert) ? -1 : 1); - - assert(flags & SCF_DO_STCLASS_OR); - - /* If the complement of this class was already there, - * the result is that they match all code points, - * (\d + \D == everything). Remove the classes from - * future consideration. Locale is not relevant in - * this case */ - if (ANYOF_POSIXL_TEST(data->start_class, complement)) { - ssc_match_all_cp(data->start_class); - ANYOF_POSIXL_CLEAR(data->start_class, namedclass); - ANYOF_POSIXL_CLEAR(data->start_class, complement); - } - else { /* The usual case; just add this class to the - existing set */ - ANYOF_POSIXL_SET(data->start_class, namedclass); - } - } - break; - - case NPOSIXA: /* For these, we always know the exact set of - what's matched */ - invert = 1; - /* FALLTHROUGH */ - case POSIXA: - my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL); - goto join_posix_and_ascii; - - case NPOSIXD: - case NPOSIXU: - invert = 1; - /* FALLTHROUGH */ - case POSIXD: - case POSIXU: - my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL); - - /* NPOSIXD matches all upper Latin1 code points unless the - * target string being matched is UTF-8, which is - * unknowable until match time. Since we are going to - * invert, we want to get rid of all of them so that the - * inversion will match all */ - if (OP(scan) == NPOSIXD) { - _invlist_subtract(my_invlist, PL_UpperLatin1, - &my_invlist); - } - - join_posix_and_ascii: - - if (flags & SCF_DO_STCLASS_AND) { - ssc_intersection(data->start_class, my_invlist, invert); - ssc_clear_locale(data->start_class); - } - else { - assert(flags & SCF_DO_STCLASS_OR); - ssc_union(data->start_class, my_invlist, invert); - } - SvREFCNT_dec(my_invlist); - } - if (flags & SCF_DO_STCLASS_OR) - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - flags &= ~SCF_DO_STCLASS; - } - } - else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) { - data->flags |= (OP(scan) == MEOL - ? SF_BEFORE_MEOL - : SF_BEFORE_SEOL); - scan_commit(pRExC_state, data, minlenp, is_inf); - - } - else if ( REGNODE_TYPE(OP(scan)) == BRANCHJ - /* Lookbehind, or need to calculate parens/evals/stclass: */ - && (scan->flags || data || (flags & SCF_DO_STCLASS)) - && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) - { - if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY - || OP(scan) == UNLESSM ) - { - /* Negative Lookahead/lookbehind - In this case we can't do fixed string optimisation. - */ - - bool is_positive = OP(scan) == IFMATCH ? 1 : 0; - SSize_t deltanext, minnext; - SSize_t fake_last_close = 0; - regnode *fake_last_close_op = NULL; - regnode *cur_last_close_op; - regnode *nscan; - regnode_ssc intrnl; - U32 f = (flags & SCF_TRIE_DOING_RESTUDY); - - StructCopy(&zero_scan_data, &data_fake, scan_data_t); - if (data) { - data_fake.whilem_c = data->whilem_c; - data_fake.last_closep = data->last_closep; - data_fake.last_close_opp = data->last_close_opp; - } - else { - data_fake.last_closep = &fake_last_close; - data_fake.last_close_opp = &fake_last_close_op; - } - - /* remember the last_close_op we saw so we can see if - * we are dealing with variable length lookbehind that - * contains capturing buffers, which are considered - * experimental */ - cur_last_close_op= *(data_fake.last_close_opp); - - data_fake.pos_delta = delta; - if ( flags & SCF_DO_STCLASS && !scan->flags - && OP(scan) == IFMATCH ) { /* Lookahead */ - ssc_init(pRExC_state, &intrnl); - data_fake.start_class = &intrnl; - f |= SCF_DO_STCLASS_AND; - } - if (flags & SCF_WHILEM_VISITED_POS) - f |= SCF_WHILEM_VISITED_POS; - next = regnext(scan); - nscan = REGNODE_AFTER(scan); - - /* recurse study_chunk() for lookahead body */ - minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, - last, &data_fake, stopparen, - recursed_depth, NULL, f, depth+1, - mutate_ok); - - if (scan->flags) { - if ( deltanext < 0 - || deltanext > (I32) U8_MAX - || minnext > (I32)U8_MAX - || minnext + deltanext > (I32)U8_MAX) - { - FAIL2("Lookbehind longer than %" UVuf " not implemented", - (UV)U8_MAX); - } - - /* The 'next_off' field has been repurposed to count the - * additional starting positions to try beyond the initial - * one. (This leaves it at 0 for non-variable length - * matches to avoid breakage for those not using this - * extension) */ - if (deltanext) { - scan->next_off = deltanext; - if ( - /* See a CLOSE op inside this lookbehind? */ - cur_last_close_op != *(data_fake.last_close_opp) - /* and not doing restudy. see: restudied */ - && !(flags & SCF_TRIE_DOING_RESTUDY) - ) { - /* this is positive variable length lookbehind with - * capture buffers inside of it */ - ckWARNexperimental_with_arg(RExC_parse, - WARN_EXPERIMENTAL__VLB, - "Variable length %s lookbehind with capturing is experimental", - is_positive ? "positive" : "negative"); - } - } - scan->flags = (U8)minnext + deltanext; - } - if (data) { - if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) - pars++; - if (data_fake.flags & SF_HAS_EVAL) - data->flags |= SF_HAS_EVAL; - data->whilem_c = data_fake.whilem_c; - } - if (f & SCF_DO_STCLASS_AND) { - if (flags & SCF_DO_STCLASS_OR) { - /* OR before, AND after: ideally we would recurse with - * data_fake to get the AND applied by study of the - * remainder of the pattern, and then derecurse; - * *** HACK *** for now just treat as "no information". - * See [perl #56690]. - */ - ssc_init(pRExC_state, data->start_class); - } else { - /* AND before and after: combine and continue. These - * assertions are zero-length, so can match an EMPTY - * string */ - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); - ANYOF_FLAGS(data->start_class) - |= SSC_MATCHES_EMPTY_STRING; - } - } - DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta); - } -#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY - else { - /* Positive Lookahead/lookbehind - In this case we can do fixed string optimisation, - but we must be careful about it. Note in the case of - lookbehind the positions will be offset by the minimum - length of the pattern, something we won't know about - until after the recurse. - */ - SSize_t deltanext, fake_last_close = 0; - regnode *last_close_op = NULL; - regnode *nscan; - regnode_ssc intrnl; - U32 f = (flags & SCF_TRIE_DOING_RESTUDY); - /* We use SAVEFREEPV so that when the full compile - is finished perl will clean up the allocated - minlens when it's all done. This way we don't - have to worry about freeing them when we know - they wont be used, which would be a pain. - */ - SSize_t *minnextp; - Newx( minnextp, 1, SSize_t ); - SAVEFREEPV(minnextp); - - if (data) { - StructCopy(data, &data_fake, scan_data_t); - if ((flags & SCF_DO_SUBSTR) && data->last_found) { - f |= SCF_DO_SUBSTR; - if (scan->flags) - scan_commit(pRExC_state, &data_fake, minlenp, is_inf); - data_fake.last_found=newSVsv(data->last_found); - } - } - else { - data_fake.last_closep = &fake_last_close; - data_fake.last_close_opp = &fake_last_close_opp; - } - data_fake.flags = 0; - data_fake.substrs[0].flags = 0; - data_fake.substrs[1].flags = 0; - data_fake.pos_delta = delta; - if (is_inf) - data_fake.flags |= SF_IS_INF; - if ( flags & SCF_DO_STCLASS && !scan->flags - && OP(scan) == IFMATCH ) { /* Lookahead */ - ssc_init(pRExC_state, &intrnl); - data_fake.start_class = &intrnl; - f |= SCF_DO_STCLASS_AND; - } - if (flags & SCF_WHILEM_VISITED_POS) - f |= SCF_WHILEM_VISITED_POS; - next = regnext(scan); - nscan = REGNODE_AFTER(scan); - - /* positive lookahead study_chunk() recursion */ - *minnextp = study_chunk(pRExC_state, &nscan, minnextp, - &deltanext, last, &data_fake, - stopparen, recursed_depth, NULL, - f, depth+1, mutate_ok); - if (scan->flags) { - assert(0); /* This code has never been tested since this - is normally not compiled */ - if ( deltanext < 0 - || deltanext > (I32) U8_MAX - || *minnextp > (I32)U8_MAX - || *minnextp + deltanext > (I32)U8_MAX) - { - FAIL2("Lookbehind longer than %" UVuf " not implemented", - (UV)U8_MAX); - } - - if (deltanext) { - scan->next_off = deltanext; - } - scan->flags = (U8)*minnextp + deltanext; - } - - *minnextp += min; - - if (f & SCF_DO_STCLASS_AND) { - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); - ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING; - } - if (data) { - if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) - pars++; - if (data_fake.flags & SF_HAS_EVAL) - data->flags |= SF_HAS_EVAL; - data->whilem_c = data_fake.whilem_c; - if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { - int i; - if (RExC_rx->minlen < *minnextp) - RExC_rx->minlen = *minnextp; - scan_commit(pRExC_state, &data_fake, minnextp, is_inf); - SvREFCNT_dec_NN(data_fake.last_found); - - for (i = 0; i < 2; i++) { - if (data_fake.substrs[i].minlenp != minlenp) { - data->substrs[i].min_offset = - data_fake.substrs[i].min_offset; - data->substrs[i].max_offset = - data_fake.substrs[i].max_offset; - data->substrs[i].minlenp = - data_fake.substrs[i].minlenp; - data->substrs[i].lookbehind += scan->flags; - } - } - } - } - } -#endif - } - else if (OP(scan) == OPEN) { - if (stopparen != (I32)PARNO(scan)) - pars++; - } - else if (OP(scan) == CLOSE) { - if (stopparen == (I32)PARNO(scan)) { - break; - } - if ((I32)PARNO(scan) == is_par) { - next = regnext(scan); - - if ( next && (OP(next) != WHILEM) && next < last) - is_par = 0; /* Disable optimization */ - } - if (data) { - *(data->last_closep) = PARNO(scan); - *(data->last_close_opp) = scan; - } - } - else if (OP(scan) == EVAL) { - if (data) - data->flags |= SF_HAS_EVAL; - } - else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) { - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data, minlenp, is_inf); - flags &= ~SCF_DO_SUBSTR; - } - if (OP(scan)==ACCEPT) { - /* m{(*ACCEPT)x} does not have to start with 'x' */ - flags &= ~SCF_DO_STCLASS; - if (data) - data->flags |= SCF_SEEN_ACCEPT; - if (stopmin > min) - stopmin = min; - } - } - else if (OP(scan) == COMMIT) { - /* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we - * must not end up with "abcxyz" as a fixed substring else we'll - * skip straight to attempting to match at offset 4. - */ - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data, minlenp, is_inf); - flags &= ~SCF_DO_SUBSTR; - } - } - else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ - { - if (flags & SCF_DO_SUBSTR) { - scan_commit(pRExC_state, data, minlenp, is_inf); - data->cur_is_floating = 1; /* float */ - } - is_inf = is_inf_internal = 1; - if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - ssc_anything(data->start_class); - flags &= ~SCF_DO_STCLASS; - } - else if (OP(scan) == GPOS) { - if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && - !(delta || is_inf || (data && data->pos_delta))) - { - if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) - RExC_rx->intflags |= PREGf_ANCH_GPOS; - if (RExC_rx->gofs < (STRLEN)min) - RExC_rx->gofs = min; - } else { - RExC_rx->intflags |= PREGf_GPOS_FLOAT; - RExC_rx->gofs = 0; - } - } -#ifdef TRIE_STUDY_OPT -#ifdef FULL_TRIE_STUDY - else if (REGNODE_TYPE(OP(scan)) == TRIE) { - /* NOTE - There is similar code to this block above for handling - BRANCH nodes on the initial study. If you change stuff here - check there too. */ - regnode *trie_node= scan; - regnode *tail= regnext(scan); - reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; - SSize_t max1 = 0, min1 = OPTIMIZE_INFTY; - regnode_ssc accum; - - if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ - /* Cannot merge strings after this. */ - scan_commit(pRExC_state, data, minlenp, is_inf); - } - if (flags & SCF_DO_STCLASS) - ssc_init_zero(pRExC_state, &accum); - - if (!trie->jump) { - min1= trie->minlen; - max1= trie->maxlen; - } else { - const regnode *nextbranch= NULL; - U32 word; - - for ( word=1 ; word <= trie->wordcount ; word++) - { - SSize_t deltanext = 0, minnext = 0; - U32 f = (flags & SCF_TRIE_DOING_RESTUDY); - SSize_t fake_last_close = 0; - regnode *fake_last_close_op = NULL; - regnode_ssc this_class; - - StructCopy(&zero_scan_data, &data_fake, scan_data_t); - if (data) { - data_fake.whilem_c = data->whilem_c; - data_fake.last_closep = data->last_closep; - data_fake.last_close_opp = data->last_close_opp; - } - else { - data_fake.last_closep = &fake_last_close; - data_fake.last_close_opp = &fake_last_close_op; - } - data_fake.pos_delta = delta; - if (flags & SCF_DO_STCLASS) { - ssc_init(pRExC_state, &this_class); - data_fake.start_class = &this_class; - f |= SCF_DO_STCLASS_AND; - } - if (flags & SCF_WHILEM_VISITED_POS) - f |= SCF_WHILEM_VISITED_POS; - - if (trie->jump[word]) { - if (!nextbranch) - nextbranch = trie_node + trie->jump[0]; - scan= trie_node + trie->jump[word]; - /* We go from the jump point to the branch that follows - it. Note this means we need the vestigal unused - branches even though they arent otherwise used. */ - /* optimise study_chunk() for TRIE */ - minnext = study_chunk(pRExC_state, &scan, minlenp, - &deltanext, (regnode *)nextbranch, &data_fake, - stopparen, recursed_depth, NULL, f, depth+1, - mutate_ok); - } - if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH) - nextbranch= regnext((regnode*)nextbranch); - - if (min1 > (SSize_t)(minnext + trie->minlen)) - min1 = minnext + trie->minlen; - if (deltanext == OPTIMIZE_INFTY) { - is_inf = is_inf_internal = 1; - max1 = OPTIMIZE_INFTY; - } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) - max1 = minnext + deltanext + trie->maxlen; - - if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) - pars++; - if (data_fake.flags & SCF_SEEN_ACCEPT) { - if ( stopmin > min + min1) - stopmin = min + min1; - flags &= ~SCF_DO_SUBSTR; - if (data) - data->flags |= SCF_SEEN_ACCEPT; - } - if (data) { - if (data_fake.flags & SF_HAS_EVAL) - data->flags |= SF_HAS_EVAL; - data->whilem_c = data_fake.whilem_c; - } - if (flags & SCF_DO_STCLASS) - ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); - } - DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta); - } - if (flags & SCF_DO_SUBSTR) { - data->pos_min += min1; - data->pos_delta += max1 - min1; - if (max1 != min1 || is_inf) - data->cur_is_floating = 1; /* float */ - } - min += min1; - if (delta != OPTIMIZE_INFTY) { - if (OPTIMIZE_INFTY - (max1 - min1) >= delta) - delta += max1 - min1; - else - delta = OPTIMIZE_INFTY; - } - if (flags & SCF_DO_STCLASS_OR) { - ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); - if (min1) { - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - flags &= ~SCF_DO_STCLASS; - } - } - else if (flags & SCF_DO_STCLASS_AND) { - if (min1) { - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); - flags &= ~SCF_DO_STCLASS; - } - else { - /* Switch to OR mode: cache the old value of - * data->start_class */ - INIT_AND_WITHP; - StructCopy(data->start_class, and_withp, regnode_ssc); - flags &= ~SCF_DO_STCLASS_AND; - StructCopy(&accum, data->start_class, regnode_ssc); - flags |= SCF_DO_STCLASS_OR; - } - } - scan= tail; - DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta); - continue; - } -#else - else if (REGNODE_TYPE(OP(scan)) == TRIE) { - reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; - U8*bang=NULL; - - min += trie->minlen; - delta += (trie->maxlen - trie->minlen); - flags &= ~SCF_DO_STCLASS; /* xxx */ - if (flags & SCF_DO_SUBSTR) { - /* Cannot expect anything... */ - scan_commit(pRExC_state, data, minlenp, is_inf); - data->pos_min += trie->minlen; - data->pos_delta += (trie->maxlen - trie->minlen); - if (trie->maxlen != trie->minlen) - data->cur_is_floating = 1; /* float */ - } - if (trie->jump) /* no more substrings -- for now /grr*/ - flags &= ~SCF_DO_SUBSTR; - } - -#endif /* old or new */ -#endif /* TRIE_STUDY_OPT */ - - else if (OP(scan) == REGEX_SET) { - Perl_croak(aTHX_ "panic: %s regnode should be resolved" - " before optimization", REGNODE_NAME(REGEX_SET)); - } - - /* Else: zero-length, ignore. */ - scan = regnext(scan); - } - - finish: - if (frame) { - /* we need to unwind recursion. */ - depth = depth - 1; - - DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta); - DEBUG_PEEP("fend", scan, depth, flags); - - /* restore previous context */ - last = frame->last_regnode; - scan = frame->next_regnode; - stopparen = frame->stopparen; - recursed_depth = frame->prev_recursed_depth; - - RExC_frame_last = frame->prev_frame; - frame = frame->this_prev_frame; - goto fake_study_recurse; - } - - assert(!frame); - DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta); - - /* is this pattern infinite? Eg, consider /(a|b+)/ */ - if (is_inf_internal) - delta = OPTIMIZE_INFTY; - - /* deal with (*ACCEPT), Eg, consider /(foo(*ACCEPT)|bop)bar/ */ - if (min > stopmin) { - /* - At this point 'min' represents the minimum length string we can - match while *ignoring* the implication of ACCEPT, and 'delta' - represents the difference between the minimum length and maximum - length, and if the pattern matches an infinitely long string - (consider the + and * quantifiers) then we use the special delta - value of OPTIMIZE_INFTY to represent it. 'stopmin' is the - minimum length that can be matched *and* accepted. - - A pattern is accepted when matching was successful *and* - complete, and thus there is no further matching needing to be - done, no backtracking to occur, etc. Prior to the introduction - of ACCEPT the only opcode that signaled acceptance was the END - opcode, which is always the very last opcode in a regex program. - ACCEPT is thus conceptually an early successful return out of - the matching process. stopmin starts out as OPTIMIZE_INFTY to - represent "the entire pattern", and is ratched down to the - "current min" if necessary when an ACCEPT opcode is encountered. - - Thus stopmin might be smaller than min if we saw an (*ACCEPT), - and we now need to account for it in both min and delta. - Consider that in a pattern /AB/ normally the min length it can - match can be computed as min(A)+min(B). But (*ACCEPT) means - that it might be something else, not even neccesarily min(A) at - all. Consider - - A = /(foo(*ACCEPT)|x+)/ - B = /whop/ - AB = /(foo(*ACCEPT)|x+)whop/ - - The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY - for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for - "whop", and the delta of 0 as the pattern is of fixed length, the - stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT. - When handling AB we expect to see a min of 5 for "xwhop", and a - delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3 - for "foo". This should result in a final min of 3 for "foo", and - a final delta of OPTIMIZE_INFTY for "xxxxx...whop". - - In something like /(dude(*ACCEPT)|irk)x{3,7}/ we would have a - min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the - stop min would be 4 for "dude". This should result in a final - min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx". - - When min is smaller than stopmin then we can ignore it. In the - fragment /(x{10,20}(*ACCEPT)|a)b+/, we would have a min of 2, - and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously - the ACCEPT doesn't reduce the minimum length of the string that - might be matched, nor affect the maximum length. - - In something like /foo(*ACCEPT)ba?r/ we would have a min of 5 - for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for - "foo". We currently turn this into a min of 3 for "foo" and a - delta of 3 for "foobar" even though technically "foobar" isn't - possible. ACCEPT affects some aspects of the optimizer, like - length computations and mandatory substring optimizations, but - there are other optimzations this routine perfoms that are not - affected and this compromise simplifies implementation. - - It might be helpful to consider that this C function is called - recursively on the pattern in a bottom up fashion, and that the - min returned by a nested call may be marked as coming from an - ACCEPT, causing its callers to treat the returned min as a - stopmin as the recursion unwinds. Thus a single ACCEPT can affect - multiple calls into this function in different ways. - */ - - if (OPTIMIZE_INFTY - delta >= min - stopmin) - delta += min - stopmin; - else - delta = OPTIMIZE_INFTY; - min = stopmin; - } - - *scanp = scan; - *deltap = delta; - - if (flags & SCF_DO_SUBSTR && is_inf) - data->pos_delta = OPTIMIZE_INFTY - data->pos_min; - if (is_par > (I32)U8_MAX) - is_par = 0; - if (is_par && pars==1 && data) { - data->flags |= SF_IN_PAR; - data->flags &= ~SF_HAS_PAR; - } - else if (pars && data) { - data->flags |= SF_HAS_PAR; - data->flags &= ~SF_IN_PAR; - } - if (flags & SCF_DO_STCLASS_OR) - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - if (flags & SCF_TRIE_RESTUDY) - data->flags |= SCF_TRIE_RESTUDY; - - - if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) { - if (min > OPTIMIZE_INFTY - delta) - RExC_maxlen = OPTIMIZE_INFTY; - else if (RExC_maxlen < min + delta) - RExC_maxlen = min + delta; - } - DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta); - return min; -} - /* add a data member to the struct reg_data attached to this regex, it should * always return a non-zero return. the 's' argument is the type of the items * being added and the n is the number of items. The length of 's' should match * the number of items. */ -STATIC U32 -S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) +U32 +Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) { U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1; - PERL_ARGS_ASSERT_ADD_DATA; + PERL_ARGS_ASSERT_REG_ADD_DATA; /* in the below expression we have (count + n - 1), the minus one is there * because the struct that we allocate already contains a slot for 1 data @@ -6958,9 +320,9 @@ S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) /* when count == 1 it means we have not initialized anything. * we always fill the 0 slot of the data array with a '%' entry, which * means "zero" (all the other types are letters) which exists purely - * so the return from add_data is ALWAYS true, so we can tell it apart + * so the return from reg_add_data is ALWAYS true, so we can tell it apart * from a "no value" idx=0 in places where we would return an index - * into add_data. This is particularly important with the new "single + * into reg_add_data. This is particularly important with the new "single * pass, usually, but not always" strategy that we use, where the code * will use a 0 to represent "not able to compute this yet". */ @@ -7435,9 +797,9 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, assert(n < pRExC_state->code_blocks->count); src = &ri->code_blocks->cb[i]; dst = &pRExC_state->code_blocks->cb[n]; - dst->start = src->start + offset; - dst->end = src->end + offset; - dst->block = src->block; + dst->start = src->start + offset; + dst->end = src->end + offset; + dst->block = src->block; dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) src->src_regex ? src->src_regex @@ -7689,9 +1051,9 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, assert(pat[src->start] == '('); assert(pat[src->end] == ')'); - dst->start = src->start; - dst->end = src->end; - dst->block = src->block; + dst->start = src->start; + dst->end = src->end; + dst->block = src->block; dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) : src->src_regex; dst++; @@ -8236,7 +1598,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_rx->intflags = 0; - RExC_flags = rx_flags; /* don't let top level (?i) bleed */ + RExC_flags = rx_flags; /* don't let top level (?i) bleed */ RExC_parse_set(exp); /* This NUL is guaranteed because the pattern comes from an SV*, and the sv @@ -8423,11 +1785,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ if (UTF) - SvUTF8_on(Rx); /* Unicode in it? */ + SvUTF8_on(Rx); /* Unicode in it? */ RExC_rxi->regstclass = NULL; - if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */ + if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */ RExC_rx->intflags |= PREGf_NAUGHTY; - scan = RExC_rxi->program + 1; /* First BRANCH. */ + scan = RExC_rxi->program + 1; /* First BRANCH. */ /* testing for BRANCH here tells us whether there is "must appear" data in the pattern. If there is then we can use it for optimisations */ @@ -8487,7 +1849,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* Ignore EXACT as we deal with it later. */ if (REGNODE_TYPE(OP(first)) == EXACT) { if (! isEXACTFish(OP(first))) { - NOOP; /* Empty, get anchored substr later. */ + NOOP; /* Empty, get anchored substr later. */ } else RExC_rxi->regstclass = first; @@ -8581,7 +1943,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, ssc_init(pRExC_state, &ch_class); data.start_class = &ch_class; stclass_flag = SCF_DO_STCLASS_AND; - } else /* XXXX Check for BOUND? */ + } else /* XXXX Check for BOUND? */ stclass_flag = 0; data.last_closep = &last_close; data.last_close_opp = &last_close_op; @@ -8661,7 +2023,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) && is_ssc_worth_it(pRExC_state, data.start_class)) { - const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f")); ssc_finalize(pRExC_state, data.start_class); @@ -8670,7 +2032,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, (regnode_ssc*)RExC_rxi->data->data[n], regnode_ssc); RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n]; - RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ + RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state); Perl_re_printf( aTHX_ @@ -8745,7 +2107,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) && is_ssc_worth_it(pRExC_state, data.start_class)) { - const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); + const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f")); ssc_finalize(pRExC_state, data.start_class); @@ -8754,7 +2116,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, (regnode_ssc*)RExC_rxi->data->data[n], regnode_ssc); RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n]; - RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ + RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state); Perl_re_printf( aTHX_ @@ -8860,7 +2222,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, #ifdef DEBUGGING if (RExC_paren_names) { - RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a")); + RExC_rxi->name_list_idx = reg_add_data( pRExC_state, STR_WITH_LEN("a")); RExC_rxi->data->data[RExC_rxi->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); } else @@ -8916,426 +2278,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } -SV* -Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, - const U32 flags) -{ - PERL_ARGS_ASSERT_REG_NAMED_BUFF; - - PERL_UNUSED_ARG(value); - - if (flags & RXapif_FETCH) { - return reg_named_buff_fetch(rx, key, flags); - } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { - Perl_croak_no_modify(); - return NULL; - } else if (flags & RXapif_EXISTS) { - return reg_named_buff_exists(rx, key, flags) - ? &PL_sv_yes - : &PL_sv_no; - } else if (flags & RXapif_REGNAMES) { - return reg_named_buff_all(rx, flags); - } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { - return reg_named_buff_scalar(rx, flags); - } else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); - return NULL; - } -} - -SV* -Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, - const U32 flags) -{ - PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; - PERL_UNUSED_ARG(lastkey); - - if (flags & RXapif_FIRSTKEY) - return reg_named_buff_firstkey(rx, flags); - else if (flags & RXapif_NEXTKEY) - return reg_named_buff_nextkey(rx, flags); - else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", - (int)flags); - return NULL; - } -} - -SV* -Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, - const U32 flags) -{ - SV *ret; - struct regexp *const rx = ReANY(r); - - PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; - - if (rx && RXp_PAREN_NAMES(rx)) { - HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); - if (he_str) { - IV i; - SV* sv_dat=HeVAL(he_str); - I32 *nums=(I32*)SvPVX(sv_dat); - AV * const retarray = (flags & RXapif_ALL) ? newAV_alloc_x(SvIVX(sv_dat)) : NULL; - for ( i=0; i<SvIVX(sv_dat); i++ ) { - if ((I32)(rx->nparens) >= nums[i] - && rx->offs[nums[i]].start != -1 - && rx->offs[nums[i]].end != -1) - { - ret = newSVpvs(""); - CALLREG_NUMBUF_FETCH(r, nums[i], ret); - if (!retarray) - return ret; - } else { - if (retarray) - ret = newSV_type(SVt_NULL); - } - if (retarray) - av_push_simple(retarray, ret); - } - if (retarray) - return newRV_noinc(MUTABLE_SV(retarray)); - } - } - return NULL; -} - -bool -Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, - const U32 flags) -{ - struct regexp *const rx = ReANY(r); - - PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; - - if (rx && RXp_PAREN_NAMES(rx)) { - if (flags & RXapif_ALL) { - return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); - } else { - SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); - if (sv) { - SvREFCNT_dec_NN(sv); - return TRUE; - } else { - return FALSE; - } - } - } else { - return FALSE; - } -} - -SV* -Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) -{ - struct regexp *const rx = ReANY(r); - - PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; - - if ( rx && RXp_PAREN_NAMES(rx) ) { - (void)hv_iterinit(RXp_PAREN_NAMES(rx)); - - return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); - } else { - return FALSE; - } -} - -SV* -Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) -{ - struct regexp *const rx = ReANY(r); - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; - - if (rx && RXp_PAREN_NAMES(rx)) { - HV *hv = RXp_PAREN_NAMES(rx); - HE *temphe; - while ( (temphe = hv_iternext_flags(hv, 0)) ) { - IV i; - IV parno = 0; - SV* sv_dat = HeVAL(temphe); - I32 *nums = (I32*)SvPVX(sv_dat); - for ( i = 0; i < SvIVX(sv_dat); i++ ) { - if ((I32)(rx->lastparen) >= nums[i] && - rx->offs[nums[i]].start != -1 && - rx->offs[nums[i]].end != -1) - { - parno = nums[i]; - break; - } - } - if (parno || flags & RXapif_ALL) { - return newSVhek(HeKEY_hek(temphe)); - } - } - } - return NULL; -} - -SV* -Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) -{ - SV *ret; - AV *av; - SSize_t length; - struct regexp *const rx = ReANY(r); - - PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; - - if (rx && RXp_PAREN_NAMES(rx)) { - if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { - return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); - } else if (flags & RXapif_ONE) { - ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); - av = MUTABLE_AV(SvRV(ret)); - length = av_count(av); - SvREFCNT_dec_NN(ret); - return newSViv(length); - } else { - Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", - (int)flags); - return NULL; - } - } - return &PL_sv_undef; -} - -SV* -Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) -{ - struct regexp *const rx = ReANY(r); - AV *av = newAV(); - - PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; - - if (rx && RXp_PAREN_NAMES(rx)) { - HV *hv= RXp_PAREN_NAMES(rx); - HE *temphe; - (void)hv_iterinit(hv); - while ( (temphe = hv_iternext_flags(hv, 0)) ) { - IV i; - IV parno = 0; - SV* sv_dat = HeVAL(temphe); - I32 *nums = (I32*)SvPVX(sv_dat); - for ( i = 0; i < SvIVX(sv_dat); i++ ) { - if ((I32)(rx->lastparen) >= nums[i] && - rx->offs[nums[i]].start != -1 && - rx->offs[nums[i]].end != -1) - { - parno = nums[i]; - break; - } - } - if (parno || flags & RXapif_ALL) { - av_push_simple(av, newSVhek(HeKEY_hek(temphe))); - } - } - } - - return newRV_noinc(MUTABLE_SV(av)); -} - -void -Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, - SV * const sv) -{ - struct regexp *const rx = ReANY(r); - char *s = NULL; - SSize_t i = 0; - SSize_t s1, t1; - I32 n = paren; - - PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; - - if ( n == RX_BUFF_IDX_CARET_PREMATCH - || n == RX_BUFF_IDX_CARET_FULLMATCH - || n == RX_BUFF_IDX_CARET_POSTMATCH - ) - { - bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); - if (!keepcopy) { - /* on something like - * $r = qr/.../; - * /$qr/p; - * the KEEPCOPY is set on the PMOP rather than the regex */ - if (PL_curpm && r == PM_GETRE(PL_curpm)) - keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); - } - if (!keepcopy) - goto ret_undef; - } - - if (!rx->subbeg) - goto ret_undef; - - if (n == RX_BUFF_IDX_CARET_FULLMATCH) - /* no need to distinguish between them any more */ - n = RX_BUFF_IDX_FULLMATCH; - - if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) - && rx->offs[0].start != -1) - { - /* $`, ${^PREMATCH} */ - i = rx->offs[0].start; - s = rx->subbeg; - } - else - if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) - && rx->offs[0].end != -1) - { - /* $', ${^POSTMATCH} */ - s = rx->subbeg - rx->suboffset + rx->offs[0].end; - i = rx->sublen + rx->suboffset - rx->offs[0].end; - } - else - if (inRANGE(n, 0, (I32)rx->nparens) && - (s1 = rx->offs[n].start) != -1 && - (t1 = rx->offs[n].end) != -1) - { - /* $&, ${^MATCH}, $1 ... */ - i = t1 - s1; - s = rx->subbeg + s1 - rx->suboffset; - } else { - goto ret_undef; - } - - assert(s >= rx->subbeg); - assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); - if (i >= 0) { -#ifdef NO_TAINT_SUPPORT - sv_setpvn(sv, s, i); -#else - const int oldtainted = TAINT_get; - TAINT_NOT; - sv_setpvn(sv, s, i); - TAINT_set(oldtainted); -#endif - if (RXp_MATCH_UTF8(rx)) - SvUTF8_on(sv); - else - SvUTF8_off(sv); - if (TAINTING_get) { - if (RXp_MATCH_TAINTED(rx)) { - if (SvTYPE(sv) >= SVt_PVMG) { - MAGIC* const mg = SvMAGIC(sv); - MAGIC* mgt; - TAINT; - SvMAGIC_set(sv, mg->mg_moremagic); - SvTAINT(sv); - if ((mgt = SvMAGIC(sv))) { - mg->mg_moremagic = mgt; - SvMAGIC_set(sv, mg); - } - } else { - TAINT; - SvTAINT(sv); - } - } else - SvTAINTED_off(sv); - } - } else { - ret_undef: - sv_set_undef(sv); - return; - } -} - -void -Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, - SV const * const value) -{ - PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; - - PERL_UNUSED_ARG(rx); - PERL_UNUSED_ARG(paren); - PERL_UNUSED_ARG(value); - - if (!PL_localizing) - Perl_croak_no_modify(); -} - -I32 -Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, - const I32 paren) -{ - struct regexp *const rx = ReANY(r); - I32 i; - I32 s1, t1; - - PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; - - if ( paren == RX_BUFF_IDX_CARET_PREMATCH - || paren == RX_BUFF_IDX_CARET_FULLMATCH - || paren == RX_BUFF_IDX_CARET_POSTMATCH - ) - { - bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); - if (!keepcopy) { - /* on something like - * $r = qr/.../; - * /$qr/p; - * the KEEPCOPY is set on the PMOP rather than the regex */ - if (PL_curpm && r == PM_GETRE(PL_curpm)) - keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); - } - if (!keepcopy) - goto warn_undef; - } - - /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */ - switch (paren) { - case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ - case RX_BUFF_IDX_PREMATCH: /* $` */ - if (rx->offs[0].start != -1) { - i = rx->offs[0].start; - if (i > 0) { - s1 = 0; - t1 = i; - goto getlen; - } - } - return 0; - - case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ - case RX_BUFF_IDX_POSTMATCH: /* $' */ - if (rx->offs[0].end != -1) { - i = rx->sublen - rx->offs[0].end; - if (i > 0) { - s1 = rx->offs[0].end; - t1 = rx->sublen; - goto getlen; - } - } - return 0; - - default: /* $& / ${^MATCH}, $1, $2, ... */ - if (paren <= (I32)rx->nparens && - (s1 = rx->offs[paren].start) != -1 && - (t1 = rx->offs[paren].end) != -1) - { - i = t1 - s1; - goto getlen; - } else { - warn_undef: - if (ckWARN(WARN_UNINITIALIZED)) - report_uninit((const SV *)sv); - return 0; - } - } - getlen: - if (i > 0 && RXp_MATCH_UTF8(rx)) { - const char * const s = rx->subbeg - rx->suboffset + s1; - const U8 *ep; - STRLEN el; - - i = t1 - s1; - if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) - i = el; - } - return i; -} SV* Perl_reg_qr_package(pTHX_ REGEXP * const rx) @@ -9458,1730 +2400,6 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) Perl_re_printf( aTHX_ fmt "\n",args); \ }) -/* This section of code defines the inversion list object and its methods. The - * interfaces are highly subject to change, so as much as possible is static to - * this file. An inversion list is here implemented as a malloc'd C UV array - * as an SVt_INVLIST scalar. - * - * An inversion list for Unicode is an array of code points, sorted by ordinal - * number. Each element gives the code point that begins a range that extends - * up-to but not including the code point given by the next element. The final - * element gives the first code point of a range that extends to the platform's - * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4], - * ...) give ranges whose code points are all in the inversion list. We say - * that those ranges are in the set. The odd-numbered elements give ranges - * whose code points are not in the inversion list, and hence not in the set. - * Thus, element [0] is the first code point in the list. Element [1] - * is the first code point beyond that not in the list; and element [2] is the - * first code point beyond that that is in the list. In other words, the first - * range is invlist[0]..(invlist[1]-1), and all code points in that range are - * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and - * all code points in that range are not in the inversion list. The third - * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion - * list, and so forth. Thus every element whose index is divisible by two - * gives the beginning of a range that is in the list, and every element whose - * index is not divisible by two gives the beginning of a range not in the - * list. If the final element's index is divisible by two, the inversion list - * extends to the platform's infinity; otherwise the highest code point in the - * inversion list is the contents of that element minus 1. - * - * A range that contains just a single code point N will look like - * invlist[i] == N - * invlist[i+1] == N+1 - * - * If N is UV_MAX (the highest representable code point on the machine), N+1 is - * impossible to represent, so element [i+1] is omitted. The single element - * inversion list - * invlist[0] == UV_MAX - * contains just UV_MAX, but is interpreted as matching to infinity. - * - * Taking the complement (inverting) an inversion list is quite simple, if the - * first element is 0, remove it; otherwise add a 0 element at the beginning. - * This implementation reserves an element at the beginning of each inversion - * list to always contain 0; there is an additional flag in the header which - * indicates if the list begins at the 0, or is offset to begin at the next - * element. This means that the inversion list can be inverted without any - * copying; just flip the flag. - * - * More about inversion lists can be found in "Unicode Demystified" - * Chapter 13 by Richard Gillam, published by Addison-Wesley. - * - * The inversion list data structure is currently implemented as an SV pointing - * to an array of UVs that the SV thinks are bytes. This allows us to have an - * array of UV whose memory management is automatically handled by the existing - * facilities for SV's. - * - * Some of the methods should always be private to the implementation, and some - * should eventually be made public */ - -/* The header definitions are in F<invlist_inline.h> */ - -#ifndef PERL_IN_XSUB_RE - -PERL_STATIC_INLINE UV* -S__invlist_array_init(SV* const invlist, const bool will_have_0) -{ - /* Returns a pointer to the first element in the inversion list's array. - * This is called upon initialization of an inversion list. Where the - * array begins depends on whether the list has the code point U+0000 in it - * or not. The other parameter tells it whether the code that follows this - * call is about to put a 0 in the inversion list or not. The first - * element is either the element reserved for 0, if TRUE, or the element - * after it, if FALSE */ - - bool* offset = get_invlist_offset_addr(invlist); - UV* zero_addr = (UV *) SvPVX(invlist); - - PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; - - /* Must be empty */ - assert(! _invlist_len(invlist)); - - *zero_addr = 0; - - /* 1^1 = 0; 1^0 = 1 */ - *offset = 1 ^ will_have_0; - return zero_addr + *offset; -} - -STATIC void -S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src) -{ - /* Replaces the inversion list in 'dest' with the one from 'src'. It - * steals the list from 'src', so 'src' is made to have a NULL list. This - * is similar to what SvSetMagicSV() would do, if it were implemented on - * inversion lists, though this routine avoids a copy */ - - const UV src_len = _invlist_len(src); - const bool src_offset = *get_invlist_offset_addr(src); - const STRLEN src_byte_len = SvLEN(src); - char * array = SvPVX(src); - -#ifndef NO_TAINT_SUPPORT - const int oldtainted = TAINT_get; -#endif - - PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC; - - assert(is_invlist(src)); - assert(is_invlist(dest)); - assert(! invlist_is_iterating(src)); - assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src)); - - /* Make sure it ends in the right place with a NUL, as our inversion list - * manipulations aren't careful to keep this true, but sv_usepvn_flags() - * asserts it */ - array[src_byte_len - 1] = '\0'; - - TAINT_NOT; /* Otherwise it breaks */ - sv_usepvn_flags(dest, - (char *) array, - src_byte_len - 1, - - /* This flag is documented to cause a copy to be avoided */ - SV_HAS_TRAILING_NUL); - TAINT_set(oldtainted); - SvPV_set(src, 0); - SvLEN_set(src, 0); - SvCUR_set(src, 0); - - /* Finish up copying over the other fields in an inversion list */ - *get_invlist_offset_addr(dest) = src_offset; - invlist_set_len(dest, src_len, src_offset); - *get_invlist_previous_index_addr(dest) = 0; - invlist_iterfinish(dest); -} - -PERL_STATIC_INLINE IV* -S_get_invlist_previous_index_addr(SV* invlist) -{ - /* Return the address of the IV that is reserved to hold the cached index - * */ - PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; - - assert(is_invlist(invlist)); - - return &(((XINVLIST*) SvANY(invlist))->prev_index); -} - -PERL_STATIC_INLINE IV -S_invlist_previous_index(SV* const invlist) -{ - /* Returns cached index of previous search */ - - PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX; - - return *get_invlist_previous_index_addr(invlist); -} - -PERL_STATIC_INLINE void -S_invlist_set_previous_index(SV* const invlist, const IV index) -{ - /* Caches <index> for later retrieval */ - - PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX; - - assert(index == 0 || index < (int) _invlist_len(invlist)); - - *get_invlist_previous_index_addr(invlist) = index; -} - -PERL_STATIC_INLINE void -S_invlist_trim(SV* invlist) -{ - /* Free the not currently-being-used space in an inversion list */ - - /* But don't free up the space needed for the 0 UV that is always at the - * beginning of the list, nor the trailing NUL */ - const UV min_size = TO_INTERNAL_SIZE(1) + 1; - - PERL_ARGS_ASSERT_INVLIST_TRIM; - - assert(is_invlist(invlist)); - - SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1)); -} - -PERL_STATIC_INLINE void -S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */ -{ - PERL_ARGS_ASSERT_INVLIST_CLEAR; - - assert(is_invlist(invlist)); - - invlist_set_len(invlist, 0, 0); - invlist_trim(invlist); -} - -#endif /* ifndef PERL_IN_XSUB_RE */ - -PERL_STATIC_INLINE bool -S_invlist_is_iterating(const SV* const invlist) -{ - PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; - - /* get_invlist_iter_addr()'s sv is non-const only because it returns a - * value that can be used to modify the invlist, it doesn't modify the - * invlist itself */ - return *(get_invlist_iter_addr((SV*)invlist)) < (STRLEN) UV_MAX; -} - -#ifndef PERL_IN_XSUB_RE - -PERL_STATIC_INLINE UV -S_invlist_max(const SV* const invlist) -{ - /* Returns the maximum number of elements storable in the inversion list's - * array, without having to realloc() */ - - PERL_ARGS_ASSERT_INVLIST_MAX; - - assert(is_invlist(invlist)); - - /* Assumes worst case, in which the 0 element is not counted in the - * inversion list, so subtracts 1 for that */ - return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ - ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 - : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; -} - -STATIC void -S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size) -{ - PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS; - - /* First 1 is in case the zero element isn't in the list; second 1 is for - * trailing NUL */ - SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1); - invlist_set_len(invlist, 0, 0); - - /* Force iterinit() to be used to get iteration to work */ - invlist_iterfinish(invlist); - - *get_invlist_previous_index_addr(invlist) = 0; - SvPOK_on(invlist); /* This allows B to extract the PV */ -} - -SV* -Perl__new_invlist(pTHX_ IV initial_size) -{ - - /* Return a pointer to a newly constructed inversion list, with enough - * space to store 'initial_size' elements. If that number is negative, a - * system default is used instead */ - - SV* new_list; - - if (initial_size < 0) { - initial_size = 10; - } - - new_list = newSV_type(SVt_INVLIST); - initialize_invlist_guts(new_list, initial_size); - - return new_list; -} - -SV* -Perl__new_invlist_C_array(pTHX_ const UV* const list) -{ - /* Return a pointer to a newly constructed inversion list, initialized to - * point to <list>, which has to be in the exact correct inversion list - * form, including internal fields. Thus this is a dangerous routine that - * should not be used in the wrong hands. The passed in 'list' contains - * several header fields at the beginning that are not part of the - * inversion list body proper */ - - const STRLEN length = (STRLEN) list[0]; - const UV version_id = list[1]; - const bool offset = cBOOL(list[2]); -#define HEADER_LENGTH 3 - /* If any of the above changes in any way, you must change HEADER_LENGTH - * (if appropriate) and regenerate INVLIST_VERSION_ID by running - * perl -E 'say int(rand 2**31-1)' - */ -#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and - data structure type, so that one being - passed in can be validated to be an - inversion list of the correct vintage. - */ - - SV* invlist = newSV_type(SVt_INVLIST); - - PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; - - if (version_id != INVLIST_VERSION_ID) { - Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); - } - - /* The generated array passed in includes header elements that aren't part - * of the list proper, so start it just after them */ - SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); - - SvLEN_set(invlist, 0); /* Means we own the contents, and the system - shouldn't touch it */ - - *(get_invlist_offset_addr(invlist)) = offset; - - /* The 'length' passed to us is the physical number of elements in the - * inversion list. But if there is an offset the logical number is one - * less than that */ - invlist_set_len(invlist, length - offset, offset); - - invlist_set_previous_index(invlist, 0); - - /* Initialize the iteration pointer. */ - invlist_iterfinish(invlist); - - SvREADONLY_on(invlist); - SvPOK_on(invlist); - - return invlist; -} - -STATIC void -S__append_range_to_invlist(pTHX_ SV* const invlist, - const UV start, const UV end) -{ - /* Subject to change or removal. Append the range from 'start' to 'end' at - * the end of the inversion list. The range must be above any existing - * ones. */ - - UV* array; - UV max = invlist_max(invlist); - UV len = _invlist_len(invlist); - bool offset; - - PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; - - if (len == 0) { /* Empty lists must be initialized */ - offset = start != 0; - array = _invlist_array_init(invlist, ! offset); - } - else { - /* Here, the existing list is non-empty. The current max entry in the - * list is generally the first value not in the set, except when the - * set extends to the end of permissible values, in which case it is - * the first entry in that final set, and so this call is an attempt to - * append out-of-order */ - - UV final_element = len - 1; - array = invlist_array(invlist); - if ( array[final_element] > start - || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) - { - Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c", - array[final_element], start, - ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); - } - - /* Here, it is a legal append. If the new range begins 1 above the end - * of the range below it, it is extending the range below it, so the - * new first value not in the set is one greater than the newly - * extended range. */ - offset = *get_invlist_offset_addr(invlist); - if (array[final_element] == start) { - if (end != UV_MAX) { - array[final_element] = end + 1; - } - else { - /* But if the end is the maximum representable on the machine, - * assume that infinity was actually what was meant. Just let - * the range that this would extend to have no end */ - invlist_set_len(invlist, len - 1, offset); - } - return; - } - } - - /* Here the new range doesn't extend any existing set. Add it */ - - len += 2; /* Includes an element each for the start and end of range */ - - /* If wll overflow the existing space, extend, which may cause the array to - * be moved */ - if (max < len) { - invlist_extend(invlist, len); - - /* Have to set len here to avoid assert failure in invlist_array() */ - invlist_set_len(invlist, len, offset); - - array = invlist_array(invlist); - } - else { - invlist_set_len(invlist, len, offset); - } - - /* The next item on the list starts the range, the one after that is - * one past the new range. */ - array[len - 2] = start; - if (end != UV_MAX) { - array[len - 1] = end + 1; - } - else { - /* But if the end is the maximum representable on the machine, just let - * the range have no end */ - invlist_set_len(invlist, len - 1, offset); - } -} - -SSize_t -Perl__invlist_search(SV* const invlist, const UV cp) -{ - /* Searches the inversion list for the entry that contains the input code - * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the - * return value is the index into the list's array of the range that - * contains <cp>, that is, 'i' such that - * array[i] <= cp < array[i+1] - */ - - IV low = 0; - IV mid; - IV high = _invlist_len(invlist); - const IV highest_element = high - 1; - const UV* array; - - PERL_ARGS_ASSERT__INVLIST_SEARCH; - - /* If list is empty, return failure. */ - if (UNLIKELY(high == 0)) { - return -1; - } - - /* (We can't get the array unless we know the list is non-empty) */ - array = invlist_array(invlist); - - mid = invlist_previous_index(invlist); - assert(mid >=0); - if (UNLIKELY(mid > highest_element)) { - mid = highest_element; - } - - /* <mid> contains the cache of the result of the previous call to this - * function (0 the first time). See if this call is for the same result, - * or if it is for mid-1. This is under the theory that calls to this - * function will often be for related code points that are near each other. - * And benchmarks show that caching gives better results. We also test - * here if the code point is within the bounds of the list. These tests - * replace others that would have had to be made anyway to make sure that - * the array bounds were not exceeded, and these give us extra information - * at the same time */ - if (cp >= array[mid]) { - if (cp >= array[highest_element]) { - return highest_element; - } - - /* Here, array[mid] <= cp < array[highest_element]. This means that - * the final element is not the answer, so can exclude it; it also - * means that <mid> is not the final element, so can refer to 'mid + 1' - * safely */ - if (cp < array[mid + 1]) { - return mid; - } - high--; - low = mid + 1; - } - else { /* cp < aray[mid] */ - if (cp < array[0]) { /* Fail if outside the array */ - return -1; - } - high = mid; - if (cp >= array[mid - 1]) { - goto found_entry; - } - } - - /* Binary search. What we are looking for is <i> such that - * array[i] <= cp < array[i+1] - * The loop below converges on the i+1. Note that there may not be an - * (i+1)th element in the array, and things work nonetheless */ - while (low < high) { - mid = (low + high) / 2; - assert(mid <= highest_element); - if (array[mid] <= cp) { /* cp >= array[mid] */ - low = mid + 1; - - /* We could do this extra test to exit the loop early. - if (cp < array[low]) { - return mid; - } - */ - } - else { /* cp < array[mid] */ - high = mid; - } - } - - found_entry: - high--; - invlist_set_previous_index(invlist, high); - return high; -} - -void -Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, - const bool complement_b, SV** output) -{ - /* Take the union of two inversion lists and point '*output' to it. On - * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly - * even 'a' or 'b'). If to an inversion list, the contents of the original - * list will be replaced by the union. The first list, 'a', may be - * NULL, in which case a copy of the second list is placed in '*output'. - * If 'complement_b' is TRUE, the union is taken of the complement - * (inversion) of 'b' instead of b itself. - * - * The basis for this comes from "Unicode Demystified" Chapter 13 by - * Richard Gillam, published by Addison-Wesley, and explained at some - * length there. The preface says to incorporate its examples into your - * code at your own risk. - * - * The algorithm is like a merge sort. */ - - const UV* array_a; /* a's array */ - const UV* array_b; - UV len_a; /* length of a's array */ - UV len_b; - - SV* u; /* the resulting union */ - UV* array_u; - UV len_u = 0; - - UV i_a = 0; /* current index into a's array */ - UV i_b = 0; - UV i_u = 0; - - /* running count, as explained in the algorithm source book; items are - * stopped accumulating and are output when the count changes to/from 0. - * The count is incremented when we start a range that's in an input's set, - * and decremented when we start a range that's not in a set. So this - * variable can be 0, 1, or 2. When it is 0 neither input is in their set, - * and hence nothing goes into the union; 1, just one of the inputs is in - * its set (and its current range gets added to the union); and 2 when both - * inputs are in their sets. */ - UV count = 0; - - PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; - assert(a != b); - assert(*output == NULL || is_invlist(*output)); - - len_b = _invlist_len(b); - if (len_b == 0) { - - /* Here, 'b' is empty, hence it's complement is all possible code - * points. So if the union includes the complement of 'b', it includes - * everything, and we need not even look at 'a'. It's easiest to - * create a new inversion list that matches everything. */ - if (complement_b) { - SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX); - - if (*output == NULL) { /* If the output didn't exist, just point it - at the new list */ - *output = everything; - } - else { /* Otherwise, replace its contents with the new list */ - invlist_replace_list_destroys_src(*output, everything); - SvREFCNT_dec_NN(everything); - } - - return; - } - - /* Here, we don't want the complement of 'b', and since 'b' is empty, - * the union will come entirely from 'a'. If 'a' is NULL or empty, the - * output will be empty */ - - if (a == NULL || _invlist_len(a) == 0) { - if (*output == NULL) { - *output = _new_invlist(0); - } - else { - invlist_clear(*output); - } - return; - } - - /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the - * union. We can just return a copy of 'a' if '*output' doesn't point - * to an existing list */ - if (*output == NULL) { - *output = invlist_clone(a, NULL); - return; - } - - /* If the output is to overwrite 'a', we have a no-op, as it's - * already in 'a' */ - if (*output == a) { - return; - } - - /* Here, '*output' is to be overwritten by 'a' */ - u = invlist_clone(a, NULL); - invlist_replace_list_destroys_src(*output, u); - SvREFCNT_dec_NN(u); - - return; - } - - /* Here 'b' is not empty. See about 'a' */ - - if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { - - /* Here, 'a' is empty (and b is not). That means the union will come - * entirely from 'b'. If '*output' is NULL, we can directly return a - * clone of 'b'. Otherwise, we replace the contents of '*output' with - * the clone */ - - SV ** dest = (*output == NULL) ? output : &u; - *dest = invlist_clone(b, NULL); - if (complement_b) { - _invlist_invert(*dest); - } - - if (dest == &u) { - invlist_replace_list_destroys_src(*output, u); - SvREFCNT_dec_NN(u); - } - - return; - } - - /* Here both lists exist and are non-empty */ - array_a = invlist_array(a); - array_b = invlist_array(b); - - /* If are to take the union of 'a' with the complement of b, set it - * up so are looking at b's complement. */ - if (complement_b) { - - /* To complement, we invert: if the first element is 0, remove it. To - * do this, we just pretend the array starts one later */ - if (array_b[0] == 0) { - array_b++; - len_b--; - } - else { - - /* But if the first element is not zero, we pretend the list starts - * at the 0 that is always stored immediately before the array. */ - array_b--; - len_b++; - } - } - - /* Size the union for the worst case: that the sets are completely - * disjoint */ - u = _new_invlist(len_a + len_b); - - /* Will contain U+0000 if either component does */ - array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0) - || (len_b > 0 && array_b[0] == 0)); - - /* Go through each input list item by item, stopping when have exhausted - * one of them */ - while (i_a < len_a && i_b < len_b) { - UV cp; /* The element to potentially add to the union's array */ - bool cp_in_set; /* is it in the input list's set or not */ - - /* We need to take one or the other of the two inputs for the union. - * Since we are merging two sorted lists, we take the smaller of the - * next items. In case of a tie, we take first the one that is in its - * set. If we first took the one not in its set, it would decrement - * the count, possibly to 0 which would cause it to be output as ending - * the range, and the next time through we would take the same number, - * and output it again as beginning the next range. By doing it the - * opposite way, there is no possibility that the count will be - * momentarily decremented to 0, and thus the two adjoining ranges will - * be seamlessly merged. (In a tie and both are in the set or both not - * in the set, it doesn't matter which we take first.) */ - if ( array_a[i_a] < array_b[i_b] - || ( array_a[i_a] == array_b[i_b] - && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) - { - cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); - cp = array_a[i_a++]; - } - else { - cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); - cp = array_b[i_b++]; - } - - /* Here, have chosen which of the two inputs to look at. Only output - * if the running count changes to/from 0, which marks the - * beginning/end of a range that's in the set */ - if (cp_in_set) { - if (count == 0) { - array_u[i_u++] = cp; - } - count++; - } - else { - count--; - if (count == 0) { - array_u[i_u++] = cp; - } - } - } - - - /* The loop above increments the index into exactly one of the input lists - * each iteration, and ends when either index gets to its list end. That - * means the other index is lower than its end, and so something is - * remaining in that one. We decrement 'count', as explained below, if - * that list is in its set. (i_a and i_b each currently index the element - * beyond the one we care about.) */ - if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) - || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) - { - count--; - } - - /* Above we decremented 'count' if the list that had unexamined elements in - * it was in its set. This has made it so that 'count' being non-zero - * means there isn't anything left to output; and 'count' equal to 0 means - * that what is left to output is precisely that which is left in the - * non-exhausted input list. - * - * To see why, note first that the exhausted input obviously has nothing - * left to add to the union. If it was in its set at its end, that means - * the set extends from here to the platform's infinity, and hence so does - * the union and the non-exhausted set is irrelevant. The exhausted set - * also contributed 1 to 'count'. If 'count' was 2, it got decremented to - * 1, but if it was 1, the non-exhausted set wasn't in its set, and so - * 'count' remains at 1. This is consistent with the decremented 'count' - * != 0 meaning there's nothing left to add to the union. - * - * But if the exhausted input wasn't in its set, it contributed 0 to - * 'count', and the rest of the union will be whatever the other input is. - * If 'count' was 0, neither list was in its set, and 'count' remains 0; - * otherwise it gets decremented to 0. This is consistent with 'count' - * == 0 meaning the remainder of the union is whatever is left in the - * non-exhausted list. */ - if (count != 0) { - len_u = i_u; - } - else { - IV copy_count = len_a - i_a; - if (copy_count > 0) { /* The non-exhausted input is 'a' */ - Copy(array_a + i_a, array_u + i_u, copy_count, UV); - } - else { /* The non-exhausted input is b */ - copy_count = len_b - i_b; - Copy(array_b + i_b, array_u + i_u, copy_count, UV); - } - len_u = i_u + copy_count; - } - - /* Set the result to the final length, which can change the pointer to - * array_u, so re-find it. (Note that it is unlikely that this will - * change, as we are shrinking the space, not enlarging it) */ - if (len_u != _invlist_len(u)) { - invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); - invlist_trim(u); - array_u = invlist_array(u); - } - - if (*output == NULL) { /* Simply return the new inversion list */ - *output = u; - } - else { - /* Otherwise, overwrite the inversion list that was in '*output'. We - * could instead free '*output', and then set it to 'u', but experience - * has shown [perl #127392] that if the input is a mortal, we can get a - * huge build-up of these during regex compilation before they get - * freed. */ - invlist_replace_list_destroys_src(*output, u); - SvREFCNT_dec_NN(u); - } - - return; -} - -void -Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, - const bool complement_b, SV** i) -{ - /* Take the intersection of two inversion lists and point '*i' to it. On - * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly - * even 'a' or 'b'). If to an inversion list, the contents of the original - * list will be replaced by the intersection. The first list, 'a', may be - * NULL, in which case '*i' will be an empty list. If 'complement_b' is - * TRUE, the result will be the intersection of 'a' and the complement (or - * inversion) of 'b' instead of 'b' directly. - * - * The basis for this comes from "Unicode Demystified" Chapter 13 by - * Richard Gillam, published by Addison-Wesley, and explained at some - * length there. The preface says to incorporate its examples into your - * code at your own risk. In fact, it had bugs - * - * The algorithm is like a merge sort, and is essentially the same as the - * union above - */ - - const UV* array_a; /* a's array */ - const UV* array_b; - UV len_a; /* length of a's array */ - UV len_b; - - SV* r; /* the resulting intersection */ - UV* array_r; - UV len_r = 0; - - UV i_a = 0; /* current index into a's array */ - UV i_b = 0; - UV i_r = 0; - - /* running count of how many of the two inputs are postitioned at ranges - * that are in their sets. As explained in the algorithm source book, - * items are stopped accumulating and are output when the count changes - * to/from 2. The count is incremented when we start a range that's in an - * input's set, and decremented when we start a range that's not in a set. - * Only when it is 2 are we in the intersection. */ - UV count = 0; - - PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; - assert(a != b); - assert(*i == NULL || is_invlist(*i)); - - /* Special case if either one is empty */ - len_a = (a == NULL) ? 0 : _invlist_len(a); - if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { - if (len_a != 0 && complement_b) { - - /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b' - * must be empty. Here, also we are using 'b's complement, which - * hence must be every possible code point. Thus the intersection - * is simply 'a'. */ - - if (*i == a) { /* No-op */ - return; - } - - if (*i == NULL) { - *i = invlist_clone(a, NULL); - return; - } - - r = invlist_clone(a, NULL); - invlist_replace_list_destroys_src(*i, r); - SvREFCNT_dec_NN(r); - return; - } - - /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The - * intersection must be empty */ - if (*i == NULL) { - *i = _new_invlist(0); - return; - } - - invlist_clear(*i); - return; - } - - /* Here both lists exist and are non-empty */ - array_a = invlist_array(a); - array_b = invlist_array(b); - - /* If are to take the intersection of 'a' with the complement of b, set it - * up so are looking at b's complement. */ - if (complement_b) { - - /* To complement, we invert: if the first element is 0, remove it. To - * do this, we just pretend the array starts one later */ - if (array_b[0] == 0) { - array_b++; - len_b--; - } - else { - - /* But if the first element is not zero, we pretend the list starts - * at the 0 that is always stored immediately before the array. */ - array_b--; - len_b++; - } - } - - /* Size the intersection for the worst case: that the intersection ends up - * fragmenting everything to be completely disjoint */ - r= _new_invlist(len_a + len_b); - - /* Will contain U+0000 iff both components do */ - array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 - && len_b > 0 && array_b[0] == 0); - - /* Go through each list item by item, stopping when have exhausted one of - * them */ - while (i_a < len_a && i_b < len_b) { - UV cp; /* The element to potentially add to the intersection's - array */ - bool cp_in_set; /* Is it in the input list's set or not */ - - /* We need to take one or the other of the two inputs for the - * intersection. Since we are merging two sorted lists, we take the - * smaller of the next items. In case of a tie, we take first the one - * that is not in its set (a difference from the union algorithm). If - * we first took the one in its set, it would increment the count, - * possibly to 2 which would cause it to be output as starting a range - * in the intersection, and the next time through we would take that - * same number, and output it again as ending the set. By doing the - * opposite of this, there is no possibility that the count will be - * momentarily incremented to 2. (In a tie and both are in the set or - * both not in the set, it doesn't matter which we take first.) */ - if ( array_a[i_a] < array_b[i_b] - || ( array_a[i_a] == array_b[i_b] - && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) - { - cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); - cp = array_a[i_a++]; - } - else { - cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); - cp= array_b[i_b++]; - } - - /* Here, have chosen which of the two inputs to look at. Only output - * if the running count changes to/from 2, which marks the - * beginning/end of a range that's in the intersection */ - if (cp_in_set) { - count++; - if (count == 2) { - array_r[i_r++] = cp; - } - } - else { - if (count == 2) { - array_r[i_r++] = cp; - } - count--; - } - - } - - /* The loop above increments the index into exactly one of the input lists - * each iteration, and ends when either index gets to its list end. That - * means the other index is lower than its end, and so something is - * remaining in that one. We increment 'count', as explained below, if the - * exhausted list was in its set. (i_a and i_b each currently index the - * element beyond the one we care about.) */ - if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) - || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) - { - count++; - } - - /* Above we incremented 'count' if the exhausted list was in its set. This - * has made it so that 'count' being below 2 means there is nothing left to - * output; otheriwse what's left to add to the intersection is precisely - * that which is left in the non-exhausted input list. - * - * To see why, note first that the exhausted input obviously has nothing - * left to affect the intersection. If it was in its set at its end, that - * means the set extends from here to the platform's infinity, and hence - * anything in the non-exhausted's list will be in the intersection, and - * anything not in it won't be. Hence, the rest of the intersection is - * precisely what's in the non-exhausted list The exhausted set also - * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing - * it means 'count' is now at least 2. This is consistent with the - * incremented 'count' being >= 2 means to add the non-exhausted list to - * the intersection. - * - * But if the exhausted input wasn't in its set, it contributed 0 to - * 'count', and the intersection can't include anything further; the - * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get - * incremented. This is consistent with 'count' being < 2 meaning nothing - * further to add to the intersection. */ - if (count < 2) { /* Nothing left to put in the intersection. */ - len_r = i_r; - } - else { /* copy the non-exhausted list, unchanged. */ - IV copy_count = len_a - i_a; - if (copy_count > 0) { /* a is the one with stuff left */ - Copy(array_a + i_a, array_r + i_r, copy_count, UV); - } - else { /* b is the one with stuff left */ - copy_count = len_b - i_b; - Copy(array_b + i_b, array_r + i_r, copy_count, UV); - } - len_r = i_r + copy_count; - } - - /* Set the result to the final length, which can change the pointer to - * array_r, so re-find it. (Note that it is unlikely that this will - * change, as we are shrinking the space, not enlarging it) */ - if (len_r != _invlist_len(r)) { - invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); - invlist_trim(r); - array_r = invlist_array(r); - } - - if (*i == NULL) { /* Simply return the calculated intersection */ - *i = r; - } - else { /* Otherwise, replace the existing inversion list in '*i'. We could - instead free '*i', and then set it to 'r', but experience has - shown [perl #127392] that if the input is a mortal, we can get a - huge build-up of these during regex compilation before they get - freed. */ - if (len_r) { - invlist_replace_list_destroys_src(*i, r); - } - else { - invlist_clear(*i); - } - SvREFCNT_dec_NN(r); - } - - return; -} - -SV* -Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end) -{ - /* Add the range from 'start' to 'end' inclusive to the inversion list's - * set. A pointer to the inversion list is returned. This may actually be - * a new list, in which case the passed in one has been destroyed. The - * passed-in inversion list can be NULL, in which case a new one is created - * with just the one range in it. The new list is not necessarily - * NUL-terminated. Space is not freed if the inversion list shrinks as a - * result of this function. The gain would not be large, and in many - * cases, this is called multiple times on a single inversion list, so - * anything freed may almost immediately be needed again. - * - * This used to mostly call the 'union' routine, but that is much more - * heavyweight than really needed for a single range addition */ - - UV* array; /* The array implementing the inversion list */ - UV len; /* How many elements in 'array' */ - SSize_t i_s; /* index into the invlist array where 'start' - should go */ - SSize_t i_e = 0; /* And the index where 'end' should go */ - UV cur_highest; /* The highest code point in the inversion list - upon entry to this function */ - - /* This range becomes the whole inversion list if none already existed */ - if (invlist == NULL) { - invlist = _new_invlist(2); - _append_range_to_invlist(invlist, start, end); - return invlist; - } - - /* Likewise, if the inversion list is currently empty */ - len = _invlist_len(invlist); - if (len == 0) { - _append_range_to_invlist(invlist, start, end); - return invlist; - } - - /* Starting here, we have to know the internals of the list */ - array = invlist_array(invlist); - - /* If the new range ends higher than the current highest ... */ - cur_highest = invlist_highest(invlist); - if (end > cur_highest) { - - /* If the whole range is higher, we can just append it */ - if (start > cur_highest) { - _append_range_to_invlist(invlist, start, end); - return invlist; - } - - /* Otherwise, add the portion that is higher ... */ - _append_range_to_invlist(invlist, cur_highest + 1, end); - - /* ... and continue on below to handle the rest. As a result of the - * above append, we know that the index of the end of the range is the - * final even numbered one of the array. Recall that the final element - * always starts a range that extends to infinity. If that range is in - * the set (meaning the set goes from here to infinity), it will be an - * even index, but if it isn't in the set, it's odd, and the final - * range in the set is one less, which is even. */ - if (end == UV_MAX) { - i_e = len; - } - else { - i_e = len - 2; - } - } - - /* We have dealt with appending, now see about prepending. If the new - * range starts lower than the current lowest ... */ - if (start < array[0]) { - - /* Adding something which has 0 in it is somewhat tricky, and uncommon. - * Let the union code handle it, rather than having to know the - * trickiness in two code places. */ - if (UNLIKELY(start == 0)) { - SV* range_invlist; - - range_invlist = _new_invlist(2); - _append_range_to_invlist(range_invlist, start, end); - - _invlist_union(invlist, range_invlist, &invlist); - - SvREFCNT_dec_NN(range_invlist); - - return invlist; - } - - /* If the whole new range comes before the first entry, and doesn't - * extend it, we have to insert it as an additional range */ - if (end < array[0] - 1) { - i_s = i_e = -1; - goto splice_in_new_range; - } - - /* Here the new range adjoins the existing first range, extending it - * downwards. */ - array[0] = start; - - /* And continue on below to handle the rest. We know that the index of - * the beginning of the range is the first one of the array */ - i_s = 0; - } - else { /* Not prepending any part of the new range to the existing list. - * Find where in the list it should go. This finds i_s, such that: - * invlist[i_s] <= start < array[i_s+1] - */ - i_s = _invlist_search(invlist, start); - } - - /* At this point, any extending before the beginning of the inversion list - * and/or after the end has been done. This has made it so that, in the - * code below, each endpoint of the new range is either in a range that is - * in the set, or is in a gap between two ranges that are. This means we - * don't have to worry about exceeding the array bounds. - * - * Find where in the list the new range ends (but we can skip this if we - * have already determined what it is, or if it will be the same as i_s, - * which we already have computed) */ - if (i_e == 0) { - i_e = (start == end) - ? i_s - : _invlist_search(invlist, end); - } - - /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e] - * is a range that goes to infinity there is no element at invlist[i_e+1], - * so only the first relation holds. */ - - if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) { - - /* Here, the ranges on either side of the beginning of the new range - * are in the set, and this range starts in the gap between them. - * - * The new range extends the range above it downwards if the new range - * ends at or above that range's start */ - const bool extends_the_range_above = ( end == UV_MAX - || end + 1 >= array[i_s+1]); - - /* The new range extends the range below it upwards if it begins just - * after where that range ends */ - if (start == array[i_s]) { - - /* If the new range fills the entire gap between the other ranges, - * they will get merged together. Other ranges may also get - * merged, depending on how many of them the new range spans. In - * the general case, we do the merge later, just once, after we - * figure out how many to merge. But in the case where the new - * range exactly spans just this one gap (possibly extending into - * the one above), we do the merge here, and an early exit. This - * is done here to avoid having to special case later. */ - if (i_e - i_s <= 1) { - - /* If i_e - i_s == 1, it means that the new range terminates - * within the range above, and hence 'extends_the_range_above' - * must be true. (If the range above it extends to infinity, - * 'i_s+2' will be above the array's limit, but 'len-i_s-2' - * will be 0, so no harm done.) */ - if (extends_the_range_above) { - Move(array + i_s + 2, array + i_s, len - i_s - 2, UV); - invlist_set_len(invlist, - len - 2, - *(get_invlist_offset_addr(invlist))); - return invlist; - } - - /* Here, i_e must == i_s. We keep them in sync, as they apply - * to the same range, and below we are about to decrement i_s - * */ - i_e--; - } - - /* Here, the new range is adjacent to the one below. (It may also - * span beyond the range above, but that will get resolved later.) - * Extend the range below to include this one. */ - array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1; - i_s--; - start = array[i_s]; - } - else if (extends_the_range_above) { - - /* Here the new range only extends the range above it, but not the - * one below. It merges with the one above. Again, we keep i_e - * and i_s in sync if they point to the same range */ - if (i_e == i_s) { - i_e++; - } - i_s++; - array[i_s] = start; - } - } - - /* Here, we've dealt with the new range start extending any adjoining - * existing ranges. - * - * If the new range extends to infinity, it is now the final one, - * regardless of what was there before */ - if (UNLIKELY(end == UV_MAX)) { - invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist))); - return invlist; - } - - /* If i_e started as == i_s, it has also been dealt with, - * and been updated to the new i_s, which will fail the following if */ - if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) { - - /* Here, the ranges on either side of the end of the new range are in - * the set, and this range ends in the gap between them. - * - * If this range is adjacent to (hence extends) the range above it, it - * becomes part of that range; likewise if it extends the range below, - * it becomes part of that range */ - if (end + 1 == array[i_e+1]) { - i_e++; - array[i_e] = start; - } - else if (start <= array[i_e]) { - array[i_e] = end + 1; - i_e--; - } - } - - if (i_s == i_e) { - - /* If the range fits entirely in an existing range (as possibly already - * extended above), it doesn't add anything new */ - if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) { - return invlist; - } - - /* Here, no part of the range is in the list. Must add it. It will - * occupy 2 more slots */ - splice_in_new_range: - - invlist_extend(invlist, len + 2); - array = invlist_array(invlist); - /* Move the rest of the array down two slots. Don't include any - * trailing NUL */ - Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV); - - /* Do the actual splice */ - array[i_e+1] = start; - array[i_e+2] = end + 1; - invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist))); - return invlist; - } - - /* Here the new range crossed the boundaries of a pre-existing range. The - * code above has adjusted things so that both ends are in ranges that are - * in the set. This means everything in between must also be in the set. - * Just squash things together */ - Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV); - invlist_set_len(invlist, - len - i_e + i_s, - *(get_invlist_offset_addr(invlist))); - - return invlist; -} - -SV* -Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, - UV** other_elements_ptr) -{ - /* Create and return an inversion list whose contents are to be populated - * by the caller. The caller gives the number of elements (in 'size') and - * the very first element ('element0'). This function will set - * '*other_elements_ptr' to an array of UVs, where the remaining elements - * are to be placed. - * - * Obviously there is some trust involved that the caller will properly - * fill in the other elements of the array. - * - * (The first element needs to be passed in, as the underlying code does - * things differently depending on whether it is zero or non-zero) */ - - SV* invlist = _new_invlist(size); - bool offset; - - PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; - - invlist = add_cp_to_invlist(invlist, element0); - offset = *get_invlist_offset_addr(invlist); - - invlist_set_len(invlist, size, offset); - *other_elements_ptr = invlist_array(invlist) + 1; - return invlist; -} - -#endif - -#ifndef PERL_IN_XSUB_RE -void -Perl__invlist_invert(pTHX_ SV* const invlist) -{ - /* Complement the input inversion list. This adds a 0 if the list didn't - * have a zero; removes it otherwise. As described above, the data - * structure is set up so that this is very efficient */ - - PERL_ARGS_ASSERT__INVLIST_INVERT; - - assert(! invlist_is_iterating(invlist)); - - /* The inverse of matching nothing is matching everything */ - if (_invlist_len(invlist) == 0) { - _append_range_to_invlist(invlist, 0, UV_MAX); - return; - } - - *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); -} - -SV* -Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist) -{ - /* Return a new inversion list that is a copy of the input one, which is - * unchanged. The new list will not be mortal even if the old one was. */ - - const STRLEN nominal_length = _invlist_len(invlist); - const STRLEN physical_length = SvCUR(invlist); - const bool offset = *(get_invlist_offset_addr(invlist)); - - PERL_ARGS_ASSERT_INVLIST_CLONE; - - if (new_invlist == NULL) { - new_invlist = _new_invlist(nominal_length); - } - else { - sv_upgrade(new_invlist, SVt_INVLIST); - initialize_invlist_guts(new_invlist, nominal_length); - } - - *(get_invlist_offset_addr(new_invlist)) = offset; - invlist_set_len(new_invlist, nominal_length, offset); - Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); - - return new_invlist; -} - -#endif - -PERL_STATIC_INLINE UV -S_invlist_lowest(SV* const invlist) -{ - /* Returns the lowest code point that matches an inversion list. This API - * has an ambiguity, as it returns 0 under either the lowest is actually - * 0, or if the list is empty. If this distinction matters to you, check - * for emptiness before calling this function */ - - UV len = _invlist_len(invlist); - UV *array; - - PERL_ARGS_ASSERT_INVLIST_LOWEST; - - if (len == 0) { - return 0; - } - - array = invlist_array(invlist); - - return array[0]; -} - -STATIC SV * -S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style) -{ - /* Get the contents of an inversion list into a string SV so that they can - * be printed out. If 'traditional_style' is TRUE, it uses the format - * traditionally done for debug tracing; otherwise it uses a format - * suitable for just copying to the output, with blanks between ranges and - * a dash between range components */ - - UV start, end; - SV* output; - const char intra_range_delimiter = (traditional_style ? '\t' : '-'); - const char inter_range_delimiter = (traditional_style ? '\n' : ' '); - - if (traditional_style) { - output = newSVpvs("\n"); - } - else { - output = newSVpvs(""); - } - - PERL_ARGS_ASSERT_INVLIST_CONTENTS; - - assert(! invlist_is_iterating(invlist)); - - invlist_iterinit(invlist); - while (invlist_iternext(invlist, &start, &end)) { - if (end == UV_MAX) { - Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c", - start, intra_range_delimiter, - inter_range_delimiter); - } - else if (end != start) { - Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c", - start, - intra_range_delimiter, - end, inter_range_delimiter); - } - else { - Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c", - start, inter_range_delimiter); - } - } - - if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */ - SvCUR_set(output, SvCUR(output) - 1); - } - - return output; -} - -#ifndef PERL_IN_XSUB_RE -void -Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, - const char * const indent, SV* const invlist) -{ - /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the - * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by - * the string 'indent'. The output looks like this: - [0] 0x000A .. 0x000D - [2] 0x0085 - [4] 0x2028 .. 0x2029 - [6] 0x3104 .. INFTY - * This means that the first range of code points matched by the list are - * 0xA through 0xD; the second range contains only the single code point - * 0x85, etc. An inversion list is an array of UVs. Two array elements - * are used to define each range (except if the final range extends to - * infinity, only a single element is needed). The array index of the - * first element for the corresponding range is given in brackets. */ - - UV start, end; - STRLEN count = 0; - - PERL_ARGS_ASSERT__INVLIST_DUMP; - - if (invlist_is_iterating(invlist)) { - Perl_dump_indent(aTHX_ level, file, - "%sCan't dump inversion list because is in middle of iterating\n", - indent); - return; - } - - invlist_iterinit(invlist); - while (invlist_iternext(invlist, &start, &end)) { - if (end == UV_MAX) { - Perl_dump_indent(aTHX_ level, file, - "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n", - indent, (UV)count, start); - } - else if (end != start) { - Perl_dump_indent(aTHX_ level, file, - "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n", - indent, (UV)count, start, end); - } - else { - Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n", - indent, (UV)count, start); - } - count += 2; - } -} - -#endif - -#if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE) -bool -Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) -{ - /* Return a boolean as to if the two passed in inversion lists are - * identical. The final argument, if TRUE, says to take the complement of - * the second inversion list before doing the comparison */ - - const UV len_a = _invlist_len(a); - UV len_b = _invlist_len(b); - - const UV* array_a = NULL; - const UV* array_b = NULL; - - PERL_ARGS_ASSERT__INVLISTEQ; - - /* This code avoids accessing the arrays unless it knows the length is - * non-zero */ - - if (len_a == 0) { - if (len_b == 0) { - return ! complement_b; - } - } - else { - array_a = invlist_array(a); - } - - if (len_b != 0) { - array_b = invlist_array(b); - } - - /* If are to compare 'a' with the complement of b, set it - * up so are looking at b's complement. */ - if (complement_b) { - - /* The complement of nothing is everything, so <a> would have to have - * just one element, starting at zero (ending at infinity) */ - if (len_b == 0) { - return (len_a == 1 && array_a[0] == 0); - } - if (array_b[0] == 0) { - - /* Otherwise, to complement, we invert. Here, the first element is - * 0, just remove it. To do this, we just pretend the array starts - * one later */ - - array_b++; - len_b--; - } - else { - - /* But if the first element is not zero, we pretend the list starts - * at the 0 that is always stored immediately before the array. */ - array_b--; - len_b++; - } - } - - return len_a == len_b - && memEQ(array_a, array_b, len_a * sizeof(array_a[0])); - -} -#endif - -/* - * As best we can, determine the characters that can match the start of - * the given EXACTF-ish node. This is for use in creating ssc nodes, so there - * can be false positive matches - * - * Returns the invlist as a new SV*; it is the caller's responsibility to - * call SvREFCNT_dec() when done with it. - */ -STATIC SV* -S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) -{ - const U8 * s = (U8*)STRING(node); - SSize_t bytelen = STR_LEN(node); - UV uc; - /* Start out big enough for 2 separate code points */ - SV* invlist = _new_invlist(4); - - PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST; - - if (! UTF) { - uc = *s; - - /* We punt and assume can match anything if the node begins - * with a multi-character fold. Things are complicated. For - * example, /ffi/i could match any of: - * "\N{LATIN SMALL LIGATURE FFI}" - * "\N{LATIN SMALL LIGATURE FF}I" - * "F\N{LATIN SMALL LIGATURE FI}" - * plus several other things; and making sure we have all the - * possibilities is hard. */ - if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) { - invlist = _add_range_to_invlist(invlist, 0, UV_MAX); - } - else { - /* Any Latin1 range character can potentially match any - * other depending on the locale, and in Turkic locales, 'I' and - * 'i' can match U+130 and U+131 */ - if (OP(node) == EXACTFL) { - _invlist_union(invlist, PL_Latin1, &invlist); - if (isALPHA_FOLD_EQ(uc, 'I')) { - invlist = add_cp_to_invlist(invlist, - LATIN_SMALL_LETTER_DOTLESS_I); - invlist = add_cp_to_invlist(invlist, - LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); - } - } - else { - /* But otherwise, it matches at least itself. We can - * quickly tell if it has a distinct fold, and if so, - * it matches that as well */ - invlist = add_cp_to_invlist(invlist, uc); - if (IS_IN_SOME_FOLD_L1(uc)) - invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]); - } - - /* Some characters match above-Latin1 ones under /i. This - * is true of EXACTFL ones when the locale is UTF-8 */ - if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) - && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA, - EXACTFAA_NO_TRIE))) - { - add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist); - } - } - } - else { /* Pattern is UTF-8 */ - U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; - const U8* e = s + bytelen; - IV fc; - - fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); - - /* The only code points that aren't folded in a UTF EXACTFish - * node are the problematic ones in EXACTFL nodes */ - if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) { - /* We need to check for the possibility that this EXACTFL - * node begins with a multi-char fold. Therefore we fold - * the first few characters of it so that we can make that - * check */ - U8 *d = folded; - int i; - - fc = -1; - for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { - if (isASCII(*s)) { - *(d++) = (U8) toFOLD(*s); - if (fc < 0) { /* Save the first fold */ - fc = *(d-1); - } - s++; - } - else { - STRLEN len; - UV fold = toFOLD_utf8_safe(s, e, d, &len); - if (fc < 0) { /* Save the first fold */ - fc = fold; - } - d += len; - s += UTF8SKIP(s); - } - } - - /* And set up so the code below that looks in this folded - * buffer instead of the node's string */ - e = d; - s = folded; - } - - /* When we reach here 's' points to the fold of the first - * character(s) of the node; and 'e' points to far enough along - * the folded string to be just past any possible multi-char - * fold. - * - * Like the non-UTF case above, we punt if the node begins with a - * multi-char fold */ - - if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { - invlist = _add_range_to_invlist(invlist, 0, UV_MAX); - } - else { /* Single char fold */ - unsigned int k; - U32 first_fold; - const U32 * remaining_folds; - Size_t folds_count; - - /* It matches itself */ - invlist = add_cp_to_invlist(invlist, fc); - - /* ... plus all the things that fold to it, which are found in - * PL_utf8_foldclosures */ - folds_count = _inverse_folds(fc, &first_fold, - &remaining_folds); - for (k = 0; k < folds_count; k++) { - UV c = (k == 0) ? first_fold : remaining_folds[k-1]; - - /* /aa doesn't allow folds between ASCII and non- */ - if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE) - && isASCII(c) != isASCII(fc)) - { - continue; - } - - invlist = add_cp_to_invlist(invlist, c); - } - - if (OP(node) == EXACTFL) { - - /* If either [iI] are present in an EXACTFL node the above code - * should have added its normal case pair, but under a Turkish - * locale they could match instead the case pairs from it. Add - * those as potential matches as well */ - if (isALPHA_FOLD_EQ(fc, 'I')) { - invlist = add_cp_to_invlist(invlist, - LATIN_SMALL_LETTER_DOTLESS_I); - invlist = add_cp_to_invlist(invlist, - LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); - } - else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) { - invlist = add_cp_to_invlist(invlist, 'I'); - } - else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { - invlist = add_cp_to_invlist(invlist, 'i'); - } - } - } - } - - return invlist; -} - -#undef HEADER_LENGTH -#undef TO_INTERNAL_SIZE -#undef FROM_INTERNAL_SIZE -#undef INVLIST_VERSION_ID - -/* End of inversion list object */ STATIC void S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) @@ -11451,12 +2669,6 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ -#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) -#ifdef DEBUGGING -#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) -#else -#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) -#endif STATIC regnode_offset S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, @@ -11484,7 +2696,7 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, } if (sv_dat) { - num = add_data( pRExC_state, STR_WITH_LEN("S")); + num = reg_add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void_NN(sv_dat); } @@ -11708,7 +2920,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) vFAIL("Too many nested open parens"); } - *flagp = 0; /* Initialize. */ + *flagp = 0; /* Initialize. */ /* Having this true makes it feasible to have a lot fewer tests for the * parse pointer being in scope. For example, we can write @@ -12034,7 +3246,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) RExC_seen |= REG_VERBARG_SEEN; if (start_arg) { SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); - ARG(REGNODE_p(ret)) = add_data( pRExC_state, + ARG(REGNODE_p(ret)) = reg_add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv; FLAGS(REGNODE_p(ret)) = 1; @@ -12066,10 +3278,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) if (RExC_parse > RExC_end) { paren = '\0'; } - ret = 0; /* For look-ahead/behind. */ + ret = 0; /* For look-ahead/behind. */ switch (paren) { - case 'P': /* (?P...) variants for those used to PCRE/Python */ + case 'P': /* (?P...) variants for those used to PCRE/Python */ paren = *RExC_parse; if ( paren == '<') { /* (?P<...>) named capture */ RExC_parse_inc_by(1); @@ -12412,13 +3624,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) RExC_parse_set(RExC_start + cb->end); o = cb->block; if (cb->src_regex) { - n = add_data(pRExC_state, STR_WITH_LEN("rl")); + n = reg_add_data(pRExC_state, STR_WITH_LEN("rl")); RExC_rxi->data->data[n] = (void*)SvREFCNT_inc((SV*)cb->src_regex); RExC_rxi->data->data[n+1] = (void*)o; } else { - n = add_data(pRExC_state, + n = reg_add_data(pRExC_state, (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); RExC_rxi->data->data[n] = (void*)o; } @@ -12511,7 +3723,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) } RExC_parse_inc_by(1); if (sv_dat) { - num = add_data( pRExC_state, STR_WITH_LEN("S")); + num = reg_add_data( pRExC_state, STR_WITH_LEN("S")); RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void_NN(sv_dat); } @@ -12783,12 +3995,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) else if (paren == ':') { *flagp |= flags&SIMPLE; } - if (is_open) { /* Starts with OPEN. */ + if (is_open) { /* Starts with OPEN. */ if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */ REQUIRE_BRANCHJ(flagp, 0); } } - else if (paren != '?') /* Not Conditional */ + else if (paren != '?') /* Not Conditional */ ret = br; *flagp |= flags & (HASWIDTH | POSTPONED); lastbr = br; @@ -13005,7 +4217,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) vFAIL("Unmatched )"); } else - FAIL("Junk on end of regexp"); /* "Can't happen". */ + FAIL("Junk on end of regexp"); /* "Can't happen". */ NOT_REACHED; /* NOTREACHED */ } @@ -13052,7 +4264,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) } } - *flagp = 0; /* Initialize. */ + *flagp = 0; /* Initialize. */ skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE /* Don't force to /x */ ); @@ -13082,7 +4294,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) chain = latest; c++; } - if (chain == 0) { /* Loop ran zero times. */ + if (chain == 0) { /* Loop ran zero times. */ chain = reg_node(pRExC_state, NOTHING); if (ret == 0) ret = chain; @@ -14004,14 +5216,6 @@ S_backref_value(char *p, char *e) return I32_MAX; } -#ifdef DEBUGGING -#define REGNODE_GUTS(state,op,extra_size) \ - regnode_guts_debug(state,op,extra_size) -#else -#define REGNODE_GUTS(state,op,extra_size) \ - regnode_guts(state,extra_size) -#endif - /* - regatom - the lowest level @@ -14093,7 +5297,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) DECLARE_AND_GET_RE_DEBUG_FLAGS; - *flagp = 0; /* Initialize. */ + *flagp = 0; /* Initialize. */ DEBUG_PARSE("atom"); @@ -14249,7 +5453,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) else { ret = reg_node(pRExC_state, SEOL); } - RExC_seen_zerolen++; /* Do not optimize RE away */ + RExC_seen_zerolen++; /* Do not optimize RE away */ goto finish_meta_pat; case 'z': if (RExC_pm_flags & PMf_WILDCARD) { @@ -14259,7 +5463,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) else { ret = reg_node(pRExC_state, EOS); } - RExC_seen_zerolen++; /* Do not optimize RE away */ + RExC_seen_zerolen++; /* Do not optimize RE away */ goto finish_meta_pat; case 'C': vFAIL("\\C no longer supported"); @@ -16046,8 +7250,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } -STATIC void -S_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) +void +Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) { /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It * sets up the bitmap and any flags, removing those code points from the @@ -16962,7 +8166,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, /* Handle the (?[...]) construct to do set operations */ U8 curchar; /* Current character being parsed */ - UV start, end; /* End points of code point ranges */ + UV start, end; /* End points of code point ranges */ SV* final = NULL; /* The end result inversion list */ SV* result_string; /* 'final' stringified */ AV* stack; /* stack of operators and operands not yet @@ -17698,8 +8902,8 @@ S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state, #undef IS_OPERATOR #undef IS_OPERAND -STATIC void -S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) +void +Perl_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) { /* This adds the Latin1/above-Latin1 folding rules. * @@ -18073,7 +9277,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, assert(RExC_parse <= RExC_end); - if (UCHARAT(RExC_parse) == '^') { /* Complement the class */ + if (UCHARAT(RExC_parse) == '^') { /* Complement the class */ RExC_parse_inc_by(1); invert = TRUE; allow_mutiple_chars = FALSE; @@ -18236,16 +9440,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, U32 packed_warn; U8 grok_c_char; - case 'w': namedclass = ANYOF_WORDCHAR; break; - case 'W': namedclass = ANYOF_NWORDCHAR; break; - case 's': namedclass = ANYOF_SPACE; break; - case 'S': namedclass = ANYOF_NSPACE; break; - case 'd': namedclass = ANYOF_DIGIT; break; - case 'D': namedclass = ANYOF_NDIGIT; break; - case 'v': namedclass = ANYOF_VERTWS; break; - case 'V': namedclass = ANYOF_NVERTWS; break; - case 'h': namedclass = ANYOF_HORIZWS; break; - case 'H': namedclass = ANYOF_NHORIZWS; break; + case 'w': namedclass = ANYOF_WORDCHAR; break; + case 'W': namedclass = ANYOF_NWORDCHAR; break; + case 's': namedclass = ANYOF_SPACE; break; + case 'S': namedclass = ANYOF_NSPACE; break; + case 'd': namedclass = ANYOF_DIGIT; break; + case 'D': namedclass = ANYOF_NDIGIT; break; + case 'v': namedclass = ANYOF_VERTWS; break; + case 'V': namedclass = ANYOF_NVERTWS; break; + case 'h': namedclass = ANYOF_HORIZWS; break; + case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { const char * const backslash_N_beg = RExC_parse - 2; @@ -18544,15 +9748,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, named */ } break; - case 'n': value = '\n'; break; - case 'r': value = '\r'; break; - case 't': value = '\t'; break; - case 'f': value = '\f'; break; - case 'b': value = '\b'; break; - case 'e': value = ESC_NATIVE; break; - case 'a': value = '\a'; break; + case 'n': value = '\n'; break; + case 'r': value = '\r'; break; + case 't': value = '\t'; break; + case 'f': value = '\f'; break; + case 'b': value = '\b'; break; + case 'e': value = ESC_NATIVE; break; + case 'a': value = '\a'; break; case 'o': - RExC_parse--; /* function expects to be pointed at the 'o' */ + RExC_parse--; /* function expects to be pointed at the 'o' */ if (! grok_bslash_o(&RExC_parse, RExC_end, &value, @@ -18574,7 +9778,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } break; case 'x': - RExC_parse--; /* function expects to be pointed at the 'x' */ + RExC_parse--; /* function expects to be pointed at the 'x' */ if (! grok_bslash_x(&RExC_parse, RExC_end, &value, @@ -18886,8 +10090,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, cp_list = add_cp_to_invlist(cp_list, '-'); element_count++; } else - range = 1; /* yeah, it's a range! */ - continue; /* but do it the next time */ + range = 1; /* yeah, it's a range! */ + continue; /* but do it the next time */ } } } @@ -19277,7 +10481,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * ones already on the list */ if (cp_foldable_list) { if (FOLD) { - UV start, end; /* End points of code point ranges */ + UV start, end; /* End points of code point ranges */ SV* fold_intersection = NULL; SV** use_list; @@ -20711,8 +11915,8 @@ S_optimize_regclass(pTHX_ #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION -STATIC void -S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, +void +Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, regnode* const node, SV* const cp_list, SV* const runtime_defns, @@ -20724,7 +11928,7 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, * 1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE * 2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE * - * Otherwise, it sets the argument to the count returned by add_data(), + * Otherwise, it sets the argument to the count returned by reg_add_data(), * having allocated and stored an array, av, as follows: * av[0] stores the inversion list defining this class as far as known at * this time, or PL_sv_undef if nothing definite is now known. @@ -20863,7 +12067,7 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, } rv = newRV_noinc(MUTABLE_SV(av)); - n = add_data(pRExC_state, STR_WITH_LEN("s")); + n = reg_add_data(pRExC_state, STR_WITH_LEN("s")); RExC_rxi->data->data[n] = (void*)rv; ARG_SET(node, n); } @@ -21437,7 +12641,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, StructCopy(--src, --dst, regnode); } - place = REGNODE_p(operand); /* Op node, where operand used to be. */ + place = REGNODE_p(operand); /* Op node, where operand used to be. */ src = place + 1; /* NOT REGNODE_AFTER! */ FLAGS(place) = 0; FILL_NODE(operand, op); @@ -21548,7 +12752,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, regnode * const temp = regnext(REGNODE_p(scan)); #ifdef EXPERIMENTAL_INPLACESCAN if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) { - bool unfolded_multi_char; /* Unexamined in this routine */ + bool unfolded_multi_char; /* Unexamined in this routine */ if (join_exact(pRExC_state, scan, &min, &unfolded_multi_char, 1, REGNODE_p(val), depth+1)) return TRUE; /* Was return EXACT */ @@ -21606,8 +12810,8 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, } #endif -STATIC SV* -S_get_ANYOFM_contents(pTHX_ const regnode * n) { +SV* +Perl_get_ANYOFM_contents(pTHX_ const regnode * n) { /* Returns an inversion list of all the code points matched by the * ANYOFM/NANYOFM node 'n' */ @@ -21639,8 +12843,8 @@ S_get_ANYOFM_contents(pTHX_ const regnode * n) { return cp_list; } -STATIC SV * -S_get_ANYOFHbbm_contents(pTHX_ const regnode * n) { +SV * +Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n) { PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS; SV * cp_list = NULL; @@ -21655,716 +12859,11 @@ S_get_ANYOFHbbm_contents(pTHX_ const regnode * n) { return cp_list; } -/* - - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form - */ -#ifdef DEBUGGING - -static void -S_regdump_intflags(pTHX_ const char *lead, const U32 flags) -{ - int bit; - int set=0; - - ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); - - for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) { - if (flags & (1<<bit)) { - if (!set++ && lead) - Perl_re_printf( aTHX_ "%s", lead); - Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]); - } - } - if (lead) { - if (set) - Perl_re_printf( aTHX_ "\n"); - else - Perl_re_printf( aTHX_ "%s[none-set]\n", lead); - } -} - -static void -S_regdump_extflags(pTHX_ const char *lead, const U32 flags) -{ - int bit; - int set=0; - regex_charset cs; - - ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8); - - for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) { - if (flags & (1U<<bit)) { - if ((1U<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */ - continue; - } - if (!set++ && lead) - Perl_re_printf( aTHX_ "%s", lead); - Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]); - } - } - if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) { - if (!set++ && lead) { - Perl_re_printf( aTHX_ "%s", lead); - } - switch (cs) { - case REGEX_UNICODE_CHARSET: - Perl_re_printf( aTHX_ "UNICODE"); - break; - case REGEX_LOCALE_CHARSET: - Perl_re_printf( aTHX_ "LOCALE"); - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - Perl_re_printf( aTHX_ "ASCII-RESTRICTED"); - break; - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED"); - break; - default: - Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET"); - break; - } - } - if (lead) { - if (set) - Perl_re_printf( aTHX_ "\n"); - else - Perl_re_printf( aTHX_ "%s[none-set]\n", lead); - } -} -#endif - -void -Perl_regdump(pTHX_ const regexp *r) -{ -#ifdef DEBUGGING - int i; - SV * const sv = sv_newmortal(); - SV *dsv= sv_newmortal(); - RXi_GET_DECL(r, ri); - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_REGDUMP; - - (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0); - - /* Header fields of interest. */ - for (i = 0; i < 2; i++) { - if (r->substrs->data[i].substr) { - RE_PV_QUOTED_DECL(s, 0, dsv, - SvPVX_const(r->substrs->data[i].substr), - RE_SV_DUMPLEN(r->substrs->data[i].substr), - PL_dump_re_max_len); - Perl_re_printf( aTHX_ - "%s %s%s at %" IVdf "..%" UVuf " ", - i ? "floating" : "anchored", - s, - RE_SV_TAIL(r->substrs->data[i].substr), - (IV)r->substrs->data[i].min_offset, - (UV)r->substrs->data[i].max_offset); - } - else if (r->substrs->data[i].utf8_substr) { - RE_PV_QUOTED_DECL(s, 1, dsv, - SvPVX_const(r->substrs->data[i].utf8_substr), - RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr), - 30); - Perl_re_printf( aTHX_ - "%s utf8 %s%s at %" IVdf "..%" UVuf " ", - i ? "floating" : "anchored", - s, - RE_SV_TAIL(r->substrs->data[i].utf8_substr), - (IV)r->substrs->data[i].min_offset, - (UV)r->substrs->data[i].max_offset); - } - } - - if (r->check_substr || r->check_utf8) - Perl_re_printf( aTHX_ - (const char *) - ( r->check_substr == r->substrs->data[1].substr - && r->check_utf8 == r->substrs->data[1].utf8_substr - ? "(checking floating" : "(checking anchored")); - if (r->intflags & PREGf_NOSCAN) - Perl_re_printf( aTHX_ " noscan"); - if (r->extflags & RXf_CHECK_ALL) - Perl_re_printf( aTHX_ " isall"); - if (r->check_substr || r->check_utf8) - Perl_re_printf( aTHX_ ") "); - - if (ri->regstclass) { - regprop(r, sv, ri->regstclass, NULL, NULL); - Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv)); - } - if (r->intflags & PREGf_ANCH) { - Perl_re_printf( aTHX_ "anchored"); - if (r->intflags & PREGf_ANCH_MBOL) - Perl_re_printf( aTHX_ "(MBOL)"); - if (r->intflags & PREGf_ANCH_SBOL) - Perl_re_printf( aTHX_ "(SBOL)"); - if (r->intflags & PREGf_ANCH_GPOS) - Perl_re_printf( aTHX_ "(GPOS)"); - Perl_re_printf( aTHX_ " "); - } - if (r->intflags & PREGf_GPOS_SEEN) - Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs); - if (r->intflags & PREGf_SKIP) - Perl_re_printf( aTHX_ "plus "); - if (r->intflags & PREGf_IMPLICIT) - Perl_re_printf( aTHX_ "implicit "); - Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen); - if (r->extflags & RXf_EVAL_SEEN) - Perl_re_printf( aTHX_ "with eval "); - Perl_re_printf( aTHX_ "\n"); - DEBUG_FLAGS_r({ - regdump_extflags("r->extflags: ", r->extflags); - regdump_intflags("r->intflags: ", r->intflags); - }); -#else - PERL_ARGS_ASSERT_REGDUMP; - PERL_UNUSED_CONTEXT; - PERL_UNUSED_ARG(r); -#endif /* DEBUGGING */ -} - -/* Should be synchronized with ANYOF_ #defines in regcomp.h */ -#ifdef DEBUGGING - -# if CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1 || CC_ALPHA_ != 2 \ - || CC_LOWER_ != 3 || CC_UPPER_ != 4 || CC_PUNCT_ != 5 \ - || CC_PRINT_ != 6 || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8 \ - || CC_CASED_ != 9 || CC_SPACE_ != 10 || CC_BLANK_ != 11 \ - || CC_XDIGIT_ != 12 || CC_CNTRL_ != 13 || CC_ASCII_ != 14 \ - || CC_VERTSPACE_ != 15 -# error Need to adjust order of anyofs[] -# endif -static const char * const anyofs[] = { - "\\w", - "\\W", - "\\d", - "\\D", - "[:alpha:]", - "[:^alpha:]", - "[:lower:]", - "[:^lower:]", - "[:upper:]", - "[:^upper:]", - "[:punct:]", - "[:^punct:]", - "[:print:]", - "[:^print:]", - "[:alnum:]", - "[:^alnum:]", - "[:graph:]", - "[:^graph:]", - "[:cased:]", - "[:^cased:]", - "\\s", - "\\S", - "[:blank:]", - "[:^blank:]", - "[:xdigit:]", - "[:^xdigit:]", - "[:cntrl:]", - "[:^cntrl:]", - "[:ascii:]", - "[:^ascii:]", - "\\v", - "\\V" -}; -#endif - -/* -- regprop - printable representation of opcode, with run time support -*/ - -void -Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state) -{ -#ifdef DEBUGGING - U8 k; - const U8 op = OP(o); - RXi_GET_DECL(prog, progi); - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_REGPROP; - - SvPVCLEAR(sv); - - if (op > REGNODE_MAX) { /* regnode.type is unsigned */ - if (pRExC_state) { /* This gives more info, if we have it */ - FAIL3("panic: corrupted regexp opcode %d > %d", - (int)op, (int)REGNODE_MAX); - } - else { - Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d", - (int)op, (int)REGNODE_MAX); - } - } - sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */ - - k = REGNODE_TYPE(op); - - if (k == EXACT) { - sv_catpvs(sv, " "); - /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) - * is a crude hack but it may be the best for now since - * we have no flag "this EXACTish node was UTF-8" - * --jhi */ - pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len, - PL_colors[0], PL_colors[1], - PERL_PV_ESCAPE_UNI_DETECT | - PERL_PV_ESCAPE_NONASCII | - PERL_PV_PRETTY_ELLIPSES | - PERL_PV_PRETTY_LTGT | - PERL_PV_PRETTY_NOCLEAR - ); - } else if (k == TRIE) { - /* print the details of the trie in dumpuntil instead, as - * progi->data isn't available here */ - const U32 n = ARG(o); - const reg_ac_data * const ac = IS_TRIE_AC(op) ? - (reg_ac_data *)progi->data->data[n] : - NULL; - const reg_trie_data * const trie - = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; - - Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(o->flags)); - DEBUG_TRIE_COMPILE_r({ - if (trie->jump) - sv_catpvs(sv, "(JUMP)"); - Perl_sv_catpvf(aTHX_ sv, - "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">", - (UV)trie->startstate, - (IV)trie->statecount-1, /* -1 because of the unused 0 element */ - (UV)trie->wordcount, - (UV)trie->minlen, - (UV)trie->maxlen, - (UV)TRIE_CHARCOUNT(trie), - (UV)trie->uniquecharcount - ); - }); - if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { - sv_catpvs(sv, "["); - (void) put_charclass_bitmap_innards(sv, - ((IS_ANYOF_TRIE(op)) - ? ANYOF_BITMAP(o) - : TRIE_BITMAP(trie)), - NULL, - NULL, - NULL, - 0, - FALSE - ); - sv_catpvs(sv, "]"); - } - } else if (k == CURLY) { - U32 lo = ARG1(o), hi = ARG2(o); - if (op == CURLYM || op == CURLYN || op == CURLYX) - Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ - Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo); - if (hi == REG_INFTY) - sv_catpvs(sv, "INFTY"); - else - Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi); - sv_catpvs(sv, "}"); - } - else if (k == WHILEM && o->flags) /* Ordinal/of */ - Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); - else if (k == REF || k == OPEN || k == CLOSE - || k == GROUPP || op == ACCEPT) - { - AV *name_list= NULL; - U32 parno= (op == ACCEPT) ? (U32)ARG2L(o) : - (op == OPEN || op == CLOSE) ? (U32)PARNO(o) : - (U32)ARG(o); - Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */ - if ( RXp_PAREN_NAMES(prog) ) { - name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); - } else if ( pRExC_state ) { - name_list= RExC_paren_name_list; - } - if ( name_list ) { - if ( k != REF || (op < REFN)) { - SV **name= av_fetch_simple(name_list, parno, 0 ); - if (name) - Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); - } - else - if (parno > 0) { - /* parno must always be larger than 0 for this block - * as it represents a slot into the data array, which - * has the 0 slot reserved for a placeholder so any valid - * index into it is always true, eg non-zero - * see the '%' "what" type and the implementation of - * S_add_data() - */ - SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]); - I32 *nums=(I32*)SvPVX(sv_dat); - SV **name= av_fetch_simple(name_list, nums[0], 0 ); - I32 n; - if (name) { - for ( n=0; n<SvIVX(sv_dat); n++ ) { - Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf, - (n ? "," : ""), (IV)nums[n]); - } - Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); - } - } - } - if ( k == REF && reginfo) { - U32 n = ARG(o); /* which paren pair */ - I32 ln = prog->offs[n].start; - if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1) - Perl_sv_catpvf(aTHX_ sv, ": FAIL"); - else if (ln == prog->offs[n].end) - Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); - else { - const char *s = reginfo->strbeg + ln; - Perl_sv_catpvf(aTHX_ sv, ": "); - Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, - PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); - } - } - } else if (k == GOSUB) { - AV *name_list= NULL; - if ( RXp_PAREN_NAMES(prog) ) { - name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); - } else if ( pRExC_state ) { - name_list= RExC_paren_name_list; - } - - /* Paren and offset */ - Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o), - (int)((o + (int)ARG2L(o)) - progi->program) ); - if (name_list) { - SV **name= av_fetch_simple(name_list, ARG(o), 0 ); - if (name) - Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); - } - } - else if (k == LOGICAL) - /* 2: embedded, otherwise 1 */ - Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); - else if (k == ANYOF || k == ANYOFH || k == ANYOFR) { - U8 flags; - char * bitmap; - U8 do_sep = 0; /* Do we need to separate various components of the - output? */ - /* Set if there is still an unresolved user-defined property */ - SV *unresolved = NULL; - - /* Things that are ignored except when the runtime locale is UTF-8 */ - SV *only_utf8_locale_invlist = NULL; - - /* Code points that don't fit in the bitmap */ - SV *nonbitmap_invlist = NULL; - - /* And things that aren't in the bitmap, but are small enough to be */ - SV* bitmap_range_not_in_bitmap = NULL; - - bool inverted; - - if (k != ANYOF) { - flags = 0; - bitmap = NULL; - } - else { - flags = ANYOF_FLAGS(o); - bitmap = ANYOF_BITMAP(o); - } - - if (op == ANYOFL || op == ANYOFPOSIXL) { - if ((flags & ANYOFL_UTF8_LOCALE_REQD)) { - sv_catpvs(sv, "{utf8-locale-reqd}"); - } - if (flags & ANYOFL_FOLD) { - sv_catpvs(sv, "{i}"); - } - } - - inverted = flags & ANYOF_INVERT; - - /* If there is stuff outside the bitmap, get it */ - if (k == ANYOFR) { - - /* For a single range, split into the parts inside vs outside the - * bitmap. */ - UV start = ANYOFRbase(o); - UV end = ANYOFRbase(o) + ANYOFRdelta(o); - - if (start < NUM_ANYOF_CODE_POINTS) { - if (end < NUM_ANYOF_CODE_POINTS) { - bitmap_range_not_in_bitmap - = _add_range_to_invlist(bitmap_range_not_in_bitmap, - start, end); - } - else { - bitmap_range_not_in_bitmap - = _add_range_to_invlist(bitmap_range_not_in_bitmap, - start, NUM_ANYOF_CODE_POINTS); - start = NUM_ANYOF_CODE_POINTS; - } - } - - if (start >= NUM_ANYOF_CODE_POINTS) { - nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, - ANYOFRbase(o), - ANYOFRbase(o) + ANYOFRdelta(o)); - } - } - else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) { - nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, - NUM_ANYOF_CODE_POINTS, - UV_MAX); - } - else if (ANYOF_HAS_AUX(o)) { - (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE, - &unresolved, - &only_utf8_locale_invlist, - &nonbitmap_invlist); - - /* The aux data may contain stuff that could fit in the bitmap. - * This could come from a user-defined property being finally - * resolved when this call was done; or much more likely because - * there are matches that require UTF-8 to be valid, and so aren't - * in the bitmap (or ANYOFR). This is teased apart later */ - _invlist_intersection(nonbitmap_invlist, - PL_InBitmap, - &bitmap_range_not_in_bitmap); - /* Leave just the things that don't fit into the bitmap */ - _invlist_subtract(nonbitmap_invlist, - PL_InBitmap, - &nonbitmap_invlist); - } - - /* Ready to start outputting. First, the initial left bracket */ - Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); - - if ( bitmap - || bitmap_range_not_in_bitmap - || only_utf8_locale_invlist - || unresolved) - { - /* Then all the things that could fit in the bitmap */ - do_sep = put_charclass_bitmap_innards( - sv, - bitmap, - bitmap_range_not_in_bitmap, - only_utf8_locale_invlist, - o, - flags, - - /* Can't try inverting for a - * better display if there - * are things that haven't - * been resolved */ - (unresolved != NULL || k == ANYOFR)); - SvREFCNT_dec(bitmap_range_not_in_bitmap); - - /* If there are user-defined properties which haven't been defined - * yet, output them. If the result is not to be inverted, it is - * clearest to output them in a separate [] from the bitmap range - * stuff. If the result is to be complemented, we have to show - * everything in one [], as the inversion applies to the whole - * thing. Use {braces} to separate them from anything in the - * bitmap and anything above the bitmap. */ - if (unresolved) { - if (inverted) { - if (! do_sep) { /* If didn't output anything in the bitmap - */ - sv_catpvs(sv, "^"); - } - sv_catpvs(sv, "{"); - } - else if (do_sep) { - Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], - PL_colors[0]); - } - sv_catsv(sv, unresolved); - if (inverted) { - sv_catpvs(sv, "}"); - } - do_sep = ! inverted; - } - else if ( do_sep == 2 - && ! nonbitmap_invlist - && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o)) - { - /* Here, the display shows the class as inverted, and - * everything above the lower display should also match, but - * there is no indication of that. Add this range so the code - * below will add it to the display */ - _invlist_union_complement_2nd(nonbitmap_invlist, - PL_InBitmap, - &nonbitmap_invlist); - } - } - - /* And, finally, add the above-the-bitmap stuff */ - if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) { - SV* contents; - - /* See if truncation size is overridden */ - const STRLEN dump_len = (PL_dump_re_max_len > 256) - ? PL_dump_re_max_len - : 256; - - /* This is output in a separate [] */ - if (do_sep) { - Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]); - } - - /* And, for easy of understanding, it is shown in the - * uncomplemented form if possible. The one exception being if - * there are unresolved items, where the inversion has to be - * delayed until runtime */ - if (inverted && ! unresolved) { - _invlist_invert(nonbitmap_invlist); - _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist); - } - - contents = invlist_contents(nonbitmap_invlist, - FALSE /* output suitable for catsv */ - ); - - /* If the output is shorter than the permissible maximum, just do it. */ - if (SvCUR(contents) <= dump_len) { - sv_catsv(sv, contents); - } - else { - const char * contents_string = SvPVX(contents); - STRLEN i = dump_len; - - /* Otherwise, start at the permissible max and work back to the - * first break possibility */ - while (i > 0 && contents_string[i] != ' ') { - i--; - } - if (i == 0) { /* Fail-safe. Use the max if we couldn't - find a legal break */ - i = dump_len; - } - - sv_catpvn(sv, contents_string, i); - sv_catpvs(sv, "..."); - } - - SvREFCNT_dec_NN(contents); - SvREFCNT_dec_NN(nonbitmap_invlist); - } - - /* And finally the matching, closing ']' */ - Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); - - if (op == ANYOFHs) { - Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1)); - } - else if (REGNODE_TYPE(op) != ANYOF) { - U8 lowest = (op != ANYOFHr) - ? FLAGS(o) - : LOWEST_ANYOF_HRx_BYTE(FLAGS(o)); - U8 highest = (op == ANYOFHr) - ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o)) - : (op == ANYOFH || op == ANYOFR) - ? 0xFF - : lowest; -#ifndef EBCDIC - if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o))) -#endif - { - Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest); - if (lowest != highest) { - Perl_sv_catpvf(aTHX_ sv, "-%02X", highest); - } - Perl_sv_catpvf(aTHX_ sv, ")"); - } - } - - SvREFCNT_dec(unresolved); - } - else if (k == ANYOFM) { - SV * cp_list = get_ANYOFM_contents(o); - - Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); - if (op == NANYOFM) { - _invlist_invert(cp_list); - } - - put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE); - Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); - - SvREFCNT_dec(cp_list); - } - else if (k == ANYOFHbbm) { - SV * cp_list = get_ANYOFHbbm_contents(o); - Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); - - sv_catsv(sv, invlist_contents(cp_list, - FALSE /* output suitable for catsv */ - )); - Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); - - SvREFCNT_dec(cp_list); - } - else if (k == POSIXD || k == NPOSIXD) { - U8 index = FLAGS(o) * 2; - if (index < C_ARRAY_LENGTH(anyofs)) { - if (*anyofs[index] != '[') { - sv_catpvs(sv, "["); - } - sv_catpv(sv, anyofs[index]); - if (*anyofs[index] != '[') { - sv_catpvs(sv, "]"); - } - } - else { - Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); - } - } - else if (k == BOUND || k == NBOUND) { - /* Must be synced with order of 'bound_type' in regcomp.h */ - const char * const bounds[] = { - "", /* Traditional */ - "{gcb}", - "{lb}", - "{sb}", - "{wb}" - }; - assert(FLAGS(o) < C_ARRAY_LENGTH(bounds)); - sv_catpv(sv, bounds[FLAGS(o)]); - } - else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) { - Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags)); - if (o->next_off) { - Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off); - } - Perl_sv_catpvf(aTHX_ sv, "]"); - } - else if (op == SBOL) - Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^"); - - /* add on the verb argument if there is one */ - if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && o->flags) { - if ( ARG(o) ) - Perl_sv_catpvf(aTHX_ sv, ":%" SVf, - SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); - else - sv_catpvs(sv, ":NULL"); - } -#else - PERL_UNUSED_CONTEXT; - PERL_UNUSED_ARG(sv); - PERL_UNUSED_ARG(o); - PERL_UNUSED_ARG(prog); - PERL_UNUSED_ARG(reginfo); - PERL_UNUSED_ARG(pRExC_state); -#endif /* DEBUGGING */ -} - SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) -{ /* Assume that RE_INTUIT is set */ +{ /* Assume that RE_INTUIT is set */ /* Returns an SV containing a string that must appear in the target for it * to match, or NULL if nothing is known that must match. * @@ -22686,7 +13185,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) } break; case '%': - /* NO-OP a '%' data contains a null pointer, so that add_data + /* NO-OP a '%' data contains a null pointer, so that reg_add_data * always returns non-zero, this should only ever happen in the * 0 index */ assert(n==0); @@ -22703,9 +13202,9 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) Safefree(ri); } -#define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t)) -#define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t)) -#define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL) +#define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t)) +#define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t)) +#define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL) /* =for apidoc re_dup_guts @@ -22921,7 +13420,7 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) break; case '%': /* this is a placeholder type, it exists purely so that - * add_data always returns a non-zero value, this type of + * reg_add_data always returns a non-zero value, this type of * entry should ONLY be present in the 0 slot of the array */ assert(i == 0); d->data[i]= ri->data->data[i]; @@ -23014,778 +13513,6 @@ Perl_save_re_context(pTHX) } #endif -#ifdef DEBUGGING - -STATIC void -S_put_code_point(pTHX_ SV *sv, UV c) -{ - PERL_ARGS_ASSERT_PUT_CODE_POINT; - - if (c > 255) { - Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c); - } - else if (isPRINT(c)) { - const char string = (char) c; - - /* We use {phrase} as metanotation in the class, so also escape literal - * braces */ - if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}') - sv_catpvs(sv, "\\"); - sv_catpvn(sv, &string, 1); - } - else if (isMNEMONIC_CNTRL(c)) { - Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c)); - } - else { - Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c); - } -} - -STATIC void -S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) -{ - /* Appends to 'sv' a displayable version of the range of code points from - * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls - * that have them, when they occur at the beginning or end of the range. - * It uses hex to output the remaining code points, unless 'allow_literals' - * is true, in which case the printable ASCII ones are output as-is (though - * some of these will be escaped by put_code_point()). - * - * NOTE: This is designed only for printing ranges of code points that fit - * inside an ANYOF bitmap. Higher code points are simply suppressed - */ - - const unsigned int min_range_count = 3; - - assert(start <= end); - - PERL_ARGS_ASSERT_PUT_RANGE; - - while (start <= end) { - UV this_end; - const char * format; - - if ( end - start < min_range_count - && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end)))) - { - /* Output a range of 1 or 2 chars individually, or longer ranges - * when printable */ - for (; start <= end; start++) { - put_code_point(sv, start); - } - break; - } - - /* If permitted by the input options, and there is a possibility that - * this range contains a printable literal, look to see if there is - * one. */ - if (allow_literals && start <= MAX_PRINT_A) { - - /* If the character at the beginning of the range isn't an ASCII - * printable, effectively split the range into two parts: - * 1) the portion before the first such printable, - * 2) the rest - * and output them separately. */ - if (! isPRINT_A(start)) { - UV temp_end = start + 1; - - /* There is no point looking beyond the final possible - * printable, in MAX_PRINT_A */ - UV max = MIN(end, MAX_PRINT_A); - - while (temp_end <= max && ! isPRINT_A(temp_end)) { - temp_end++; - } - - /* Here, temp_end points to one beyond the first printable if - * found, or to one beyond 'max' if not. If none found, make - * sure that we use the entire range */ - if (temp_end > MAX_PRINT_A) { - temp_end = end + 1; - } - - /* Output the first part of the split range: the part that - * doesn't have printables, with the parameter set to not look - * for literals (otherwise we would infinitely recurse) */ - put_range(sv, start, temp_end - 1, FALSE); - - /* The 2nd part of the range (if any) starts here. */ - start = temp_end; - - /* We do a continue, instead of dropping down, because even if - * the 2nd part is non-empty, it could be so short that we want - * to output it as individual characters, as tested for at the - * top of this loop. */ - continue; - } - - /* Here, 'start' is a printable ASCII. If it is an alphanumeric, - * output a sub-range of just the digits or letters, then process - * the remaining portion as usual. */ - if (isALPHANUMERIC_A(start)) { - UV mask = (isDIGIT_A(start)) - ? CC_DIGIT_ - : isUPPER_A(start) - ? CC_UPPER_ - : CC_LOWER_; - UV temp_end = start + 1; - - /* Find the end of the sub-range that includes just the - * characters in the same class as the first character in it */ - while (temp_end <= end && generic_isCC_A_(temp_end, mask)) { - temp_end++; - } - temp_end--; - - /* For short ranges, don't duplicate the code above to output - * them; just call recursively */ - if (temp_end - start < min_range_count) { - put_range(sv, start, temp_end, FALSE); - } - else { /* Output as a range */ - put_code_point(sv, start); - sv_catpvs(sv, "-"); - put_code_point(sv, temp_end); - } - start = temp_end + 1; - continue; - } - - /* We output any other printables as individual characters */ - if (isPUNCT_A(start) || isSPACE_A(start)) { - while (start <= end && (isPUNCT_A(start) - || isSPACE_A(start))) - { - put_code_point(sv, start); - start++; - } - continue; - } - } /* End of looking for literals */ - - /* Here is not to output as a literal. Some control characters have - * mnemonic names. Split off any of those at the beginning and end of - * the range to print mnemonically. It isn't possible for many of - * these to be in a row, so this won't overwhelm with output */ - if ( start <= end - && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end))) - { - while (isMNEMONIC_CNTRL(start) && start <= end) { - put_code_point(sv, start); - start++; - } - - /* If this didn't take care of the whole range ... */ - if (start <= end) { - - /* Look backwards from the end to find the final non-mnemonic - * */ - UV temp_end = end; - while (isMNEMONIC_CNTRL(temp_end)) { - temp_end--; - } - - /* And separately output the interior range that doesn't start - * or end with mnemonics */ - put_range(sv, start, temp_end, FALSE); - - /* Then output the mnemonic trailing controls */ - start = temp_end + 1; - while (start <= end) { - put_code_point(sv, start); - start++; - } - break; - } - } - - /* As a final resort, output the range or subrange as hex. */ - - if (start >= NUM_ANYOF_CODE_POINTS) { - this_end = end; - } - else { /* Have to split range at the bitmap boundary */ - this_end = (end < NUM_ANYOF_CODE_POINTS) - ? end - : NUM_ANYOF_CODE_POINTS - 1; - } -#if NUM_ANYOF_CODE_POINTS > 256 - format = (this_end < 256) - ? "\\x%02" UVXf "-\\x%02" UVXf - : "\\x{%04" UVXf "}-\\x{%04" UVXf "}"; -#else - format = "\\x%02" UVXf "-\\x%02" UVXf; -#endif - GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); - Perl_sv_catpvf(aTHX_ sv, format, start, this_end); - GCC_DIAG_RESTORE_STMT; - break; - } -} - -STATIC void -S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist) -{ - /* Concatenate onto the PV in 'sv' a displayable form of the inversion list - * 'invlist' */ - - UV start, end; - bool allow_literals = TRUE; - - PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST; - - /* Generally, it is more readable if printable characters are output as - * literals, but if a range (nearly) spans all of them, it's best to output - * it as a single range. This code will use a single range if all but 2 - * ASCII printables are in it */ - invlist_iterinit(invlist); - while (invlist_iternext(invlist, &start, &end)) { - - /* If the range starts beyond the final printable, it doesn't have any - * in it */ - if (start > MAX_PRINT_A) { - break; - } - - /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span - * all but two, the range must start and end no later than 2 from - * either end */ - if (start < ' ' + 2 && end > MAX_PRINT_A - 2) { - if (end > MAX_PRINT_A) { - end = MAX_PRINT_A; - } - if (start < ' ') { - start = ' '; - } - if (end - start >= MAX_PRINT_A - ' ' - 2) { - allow_literals = FALSE; - } - break; - } - } - invlist_iterfinish(invlist); - - /* Here we have figured things out. Output each range */ - invlist_iterinit(invlist); - while (invlist_iternext(invlist, &start, &end)) { - if (start >= NUM_ANYOF_CODE_POINTS) { - break; - } - put_range(sv, start, end, allow_literals); - } - invlist_iterfinish(invlist); - - return; -} - -STATIC SV* -S_put_charclass_bitmap_innards_common(pTHX_ - SV* invlist, /* The bitmap */ - SV* posixes, /* Under /l, things like [:word:], \S */ - SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */ - SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */ - SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */ - const bool invert /* Is the result to be inverted? */ -) -{ - /* Create and return an SV containing a displayable version of the bitmap - * and associated information determined by the input parameters. If the - * output would have been only the inversion indicator '^', NULL is instead - * returned. */ - - SV * output; - - PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON; - - if (invert) { - output = newSVpvs("^"); - } - else { - output = newSVpvs(""); - } - - /* First, the code points in the bitmap that are unconditionally there */ - put_charclass_bitmap_innards_invlist(output, invlist); - - /* Traditionally, these have been placed after the main code points */ - if (posixes) { - sv_catsv(output, posixes); - } - - if (only_utf8 && _invlist_len(only_utf8)) { - Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]); - put_charclass_bitmap_innards_invlist(output, only_utf8); - } - - if (not_utf8 && _invlist_len(not_utf8)) { - Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]); - put_charclass_bitmap_innards_invlist(output, not_utf8); - } - - if (only_utf8_locale && _invlist_len(only_utf8_locale)) { - Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]); - put_charclass_bitmap_innards_invlist(output, only_utf8_locale); - - /* This is the only list in this routine that can legally contain code - * points outside the bitmap range. The call just above to - * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so - * output them here. There's about a half-dozen possible, and none in - * contiguous ranges longer than 2 */ - if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { - UV start, end; - SV* above_bitmap = NULL; - - _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap); - - invlist_iterinit(above_bitmap); - while (invlist_iternext(above_bitmap, &start, &end)) { - UV i; - - for (i = start; i <= end; i++) { - put_code_point(output, i); - } - } - invlist_iterfinish(above_bitmap); - SvREFCNT_dec_NN(above_bitmap); - } - } - - if (invert && SvCUR(output) == 1) { - return NULL; - } - - return output; -} - -STATIC U8 -S_put_charclass_bitmap_innards(pTHX_ SV *sv, - char *bitmap, - SV *nonbitmap_invlist, - SV *only_utf8_locale_invlist, - const regnode * const node, - const U8 flags, - const bool force_as_is_display) -{ - /* Appends to 'sv' a displayable version of the innards of the bracketed - * character class defined by the other arguments: - * 'bitmap' points to the bitmap, or NULL if to ignore that. - * 'nonbitmap_invlist' is an inversion list of the code points that are in - * the bitmap range, but for some reason aren't in the bitmap; NULL if - * none. The reasons for this could be that they require some - * condition such as the target string being or not being in UTF-8 - * (under /d), or because they came from a user-defined property that - * was not resolved at the time of the regex compilation (under /u) - * 'only_utf8_locale_invlist' is an inversion list of the code points that - * are valid only if the runtime locale is a UTF-8 one; NULL if none - * 'node' is the regex pattern ANYOF node. It is needed only when the - * above two parameters are not null, and is passed so that this - * routine can tease apart the various reasons for them. - * 'flags' is the flags field of 'node' - * 'force_as_is_display' is TRUE if this routine should definitely NOT try - * to invert things to see if that leads to a cleaner display. If - * FALSE, this routine is free to use its judgment about doing this. - * - * It returns 0 if nothing was actually output. (It may be that - * the bitmap, etc is empty.) - * 1 if the output wasn't inverted (didn't begin with a '^') - * 2 if the output was inverted (did begin with a '^') - * - * When called for outputting the bitmap of a non-ANYOF node, just pass the - * bitmap, with the succeeding parameters set to NULL, and the final one to - * FALSE. - */ - - /* In general, it tries to display the 'cleanest' representation of the - * innards, choosing whether to display them inverted or not, regardless of - * whether the class itself is to be inverted. However, there are some - * cases where it can't try inverting, as what actually matches isn't known - * until runtime, and hence the inversion isn't either. */ - - bool inverting_allowed = ! force_as_is_display; - - int i; - STRLEN orig_sv_cur = SvCUR(sv); - - SV* invlist; /* Inversion list we accumulate of code points that - are unconditionally matched */ - SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is - UTF-8 */ - SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8 - */ - SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */ - SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale - is UTF-8 */ - - SV* as_is_display; /* The output string when we take the inputs - literally */ - SV* inverted_display; /* The output string when we invert the inputs */ - - bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted - to match? */ - /* We are biased in favor of displaying things without them being inverted, - * as that is generally easier to understand */ - const int bias = 5; - - PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS; - - /* Start off with whatever code points are passed in. (We clone, so we - * don't change the caller's list) */ - if (nonbitmap_invlist) { - assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS); - invlist = invlist_clone(nonbitmap_invlist, NULL); - } - else { /* Worst case size is every other code point is matched */ - invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2); - } - - if (flags) { - if (OP(node) == ANYOFD) { - - /* This flag indicates that the code points below 0x100 in the - * nonbitmap list are precisely the ones that match only when the - * target is UTF-8 (they should all be non-ASCII). */ - if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) { - _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8); - _invlist_subtract(invlist, only_utf8, &invlist); - } - - /* And this flag for matching all non-ASCII 0xFF and below */ - if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) { - not_utf8 = invlist_clone(PL_UpperLatin1, NULL); - } - } - else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) { - - /* If either of these flags are set, what matches isn't - * determinable except during execution, so don't know enough here - * to invert */ - if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) { - inverting_allowed = FALSE; - } - - /* What the posix classes match also varies at runtime, so these - * will be output symbolically. */ - if (ANYOF_POSIXL_TEST_ANY_SET(node)) { - int i; - - posixes = newSVpvs(""); - for (i = 0; i < ANYOF_POSIXL_MAX; i++) { - if (ANYOF_POSIXL_TEST(node, i)) { - sv_catpv(posixes, anyofs[i]); - } - } - } - } - } - - /* Accumulate the bit map into the unconditional match list */ - if (bitmap) { - for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { - if (BITMAP_TEST(bitmap, i)) { - int start = i++; - for (; - i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); - i++) - { /* empty */ } - invlist = _add_range_to_invlist(invlist, start, i-1); - } - } - } - - /* Make sure that the conditional match lists don't have anything in them - * that match unconditionally; otherwise the output is quite confusing. - * This could happen if the code that populates these misses some - * duplication. */ - if (only_utf8) { - _invlist_subtract(only_utf8, invlist, &only_utf8); - } - if (not_utf8) { - _invlist_subtract(not_utf8, invlist, ¬_utf8); - } - - if (only_utf8_locale_invlist) { - - /* Since this list is passed in, we have to make a copy before - * modifying it */ - only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL); - - _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale); - - /* And, it can get really weird for us to try outputting an inverted - * form of this list when it has things above the bitmap, so don't even - * try */ - if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { - inverting_allowed = FALSE; - } - } - - /* Calculate what the output would be if we take the input as-is */ - as_is_display = put_charclass_bitmap_innards_common(invlist, - posixes, - only_utf8, - not_utf8, - only_utf8_locale, - invert); - - /* If have to take the output as-is, just do that */ - if (! inverting_allowed) { - if (as_is_display) { - sv_catsv(sv, as_is_display); - SvREFCNT_dec_NN(as_is_display); - } - } - else { /* But otherwise, create the output again on the inverted input, and - use whichever version is shorter */ - - int inverted_bias, as_is_bias; - - /* We will apply our bias to whichever of the results doesn't have - * the '^' */ - bool trial_invert; - if (invert) { - trial_invert = FALSE; - as_is_bias = bias; - inverted_bias = 0; - } - else { - trial_invert = TRUE; - as_is_bias = 0; - inverted_bias = bias; - } - - /* Now invert each of the lists that contribute to the output, - * excluding from the result things outside the possible range */ - - /* For the unconditional inversion list, we have to add in all the - * conditional code points, so that when inverted, they will be gone - * from it */ - _invlist_union(only_utf8, invlist, &invlist); - _invlist_union(not_utf8, invlist, &invlist); - _invlist_union(only_utf8_locale, invlist, &invlist); - _invlist_invert(invlist); - _invlist_intersection(invlist, PL_InBitmap, &invlist); - - if (only_utf8) { - _invlist_invert(only_utf8); - _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8); - } - else if (not_utf8) { - - /* If a code point matches iff the target string is not in UTF-8, - * then complementing the result has it not match iff not in UTF-8, - * which is the same thing as matching iff it is UTF-8. */ - only_utf8 = not_utf8; - not_utf8 = NULL; - } - - if (only_utf8_locale) { - _invlist_invert(only_utf8_locale); - _invlist_intersection(only_utf8_locale, - PL_InBitmap, - &only_utf8_locale); - } - - inverted_display = put_charclass_bitmap_innards_common( - invlist, - posixes, - only_utf8, - not_utf8, - only_utf8_locale, trial_invert); - - /* Use the shortest representation, taking into account our bias - * against showing it inverted */ - if ( inverted_display - && ( ! as_is_display - || ( SvCUR(inverted_display) + inverted_bias - < SvCUR(as_is_display) + as_is_bias))) - { - sv_catsv(sv, inverted_display); - invert = ! invert; - } - else if (as_is_display) { - sv_catsv(sv, as_is_display); - } - - SvREFCNT_dec(as_is_display); - SvREFCNT_dec(inverted_display); - } - - SvREFCNT_dec_NN(invlist); - SvREFCNT_dec(only_utf8); - SvREFCNT_dec(not_utf8); - SvREFCNT_dec(posixes); - SvREFCNT_dec(only_utf8_locale); - - U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur); - if (did_output_something) { - /* Distinguish between non and inverted cases */ - did_output_something += invert; - } - - return did_output_something; -} - -#define CLEAR_OPTSTART \ - if (optstart) STMT_START { \ - DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \ - " (%" IVdf " nodes)\n", (IV)(node - optstart))); \ - optstart=NULL; \ - } STMT_END - -#define DUMPUNTIL(b,e) \ - CLEAR_OPTSTART; \ - node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); - -STATIC const regnode * -S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, - const regnode *last, const regnode *plast, - SV* sv, I32 indent, U32 depth) -{ - const regnode *next; - const regnode *optstart= NULL; - - RXi_GET_DECL(r, ri); - DECLARE_AND_GET_RE_DEBUG_FLAGS; - - PERL_ARGS_ASSERT_DUMPUNTIL; - -#ifdef DEBUG_DUMPUNTIL - Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start, - last ? last-start : 0, plast ? plast-start : 0); -#endif - - if (plast && plast < last) - last= plast; - - while (node && (!last || node < last)) { - const U8 op = OP(node); - - if (op == CLOSE || op == SRCLOSE || op == WHILEM) - indent--; - next = regnext((regnode *)node); - const regnode *after = regnode_after((regnode *)node,0); - - /* Where, what. */ - if (op == OPTIMIZED) { - if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) - optstart = node; - else - goto after_print; - } else - CLEAR_OPTSTART; - - regprop(r, sv, node, NULL, NULL); - Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start), - (int)(2*indent + 1), "", SvPVX_const(sv)); - - if (op != OPTIMIZED) { - if (next == NULL) /* Next ptr. */ - Perl_re_printf( aTHX_ " (0)"); - else if (REGNODE_TYPE(op) == BRANCH - && REGNODE_TYPE(OP(next)) != BRANCH ) - Perl_re_printf( aTHX_ " (FAIL)"); - else - Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start)); - Perl_re_printf( aTHX_ "\n"); - } - - after_print: - if (REGNODE_TYPE(op) == BRANCHJ) { - assert(next); - const regnode *nnode = (OP(next) == LONGJMP - ? regnext((regnode *)next) - : next); - if (last && nnode > last) - nnode = last; - DUMPUNTIL(after, nnode); - } - else if (REGNODE_TYPE(op) == BRANCH) { - assert(next); - DUMPUNTIL(after, next); - } - else if ( REGNODE_TYPE(op) == TRIE ) { - const regnode *this_trie = node; - const U32 n = ARG(node); - const reg_ac_data * const ac = op>=AHOCORASICK ? - (reg_ac_data *)ri->data->data[n] : - NULL; - const reg_trie_data * const trie = - (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie]; -#ifdef DEBUGGING - AV *const trie_words - = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); -#endif - const regnode *nextbranch= NULL; - I32 word_idx; - SvPVCLEAR(sv); - for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { - SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0); - - Perl_re_indentf( aTHX_ "%s ", - indent+3, - elem_ptr - ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), - SvCUR(*elem_ptr), PL_dump_re_max_len, - PL_colors[0], PL_colors[1], - (SvUTF8(*elem_ptr) - ? PERL_PV_ESCAPE_UNI - : 0) - | PERL_PV_PRETTY_ELLIPSES - | PERL_PV_PRETTY_LTGT - ) - : "???" - ); - if (trie->jump) { - U16 dist= trie->jump[word_idx+1]; - Perl_re_printf( aTHX_ "(%" UVuf ")\n", - (UV)((dist ? this_trie + dist : next) - start)); - if (dist) { - if (!nextbranch) - nextbranch= this_trie + trie->jump[0]; - DUMPUNTIL(this_trie + dist, nextbranch); - } - if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH) - nextbranch= regnext((regnode *)nextbranch); - } else { - Perl_re_printf( aTHX_ "\n"); - } - } - if (last && next > last) - node= last; - else - node= next; - } - else if ( op == CURLY ) { /* "next" might be very big: optimizer */ - DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */ - } - else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) { - assert(next); - DUMPUNTIL(after, next); - } - else if ( op == PLUS || op == STAR) { - DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */ - } - else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) { - /* Literal string, where present. */ - node = (const regnode *)REGNODE_AFTER_varies(node); - } - else { - node = REGNODE_AFTER_opcode(node,op); - } - if (op == CURLYX || op == OPEN || op == SROPEN) - indent++; - if (REGNODE_TYPE(op) == END) - break; - } - CLEAR_OPTSTART; -#ifdef DEBUG_DUMPUNTIL - Perl_re_printf( aTHX_ "--- %d\n", (int)indent); -#endif - return node; -} - -#endif /* DEBUGGING */ - #ifndef PERL_IN_XSUB_RE # include "uni_keywords.h" @@ -23939,7 +13666,7 @@ warning when none was present before might cause breakage, for little gain. So khw left this code in, but not enabled. Tests were never added. embed.fnc entry: -Ei |const char *|get_extended_utf8_msg|const UV cp +Ei |const char *|get_extended_utf8_msg|const UV cp PERL_STATIC_INLINE const char * S_get_extended_utf8_msg(pTHX_ const UV cp) @@ -64,7 +64,7 @@ typedef struct regexp_internal { regnode *regstclass; /* Optional startclass as identified or constructed by the optimiser */ - struct reg_data *data; /* Additional miscellaneous data used by the program. + struct reg_data *data; /* Additional miscellaneous data used by the program. Used to make it easier to clone and free arbitrary data that the regops need. Often the ARG field of a regop is an index into this structure. NOTE the @@ -76,10 +76,10 @@ typedef struct regexp_internal { only valid when RXp_PAREN_NAMES(prog) is true, 0 means "no value" like any other index into the data array.*/ - regnode program[1]; /* Unwarranted chumminess with compiler. */ + regnode program[1]; /* Unwarranted chumminess with compiler. */ } regexp_internal; -#define RXi_SET(x,y) (x)->pprivate = (void*)(y) +#define RXi_SET(x,y) (x)->pprivate = (void*)(y) #define RXi_GET(x) ((regexp_internal *)((x)->pprivate)) #define RXi_GET_DECL(r,ri) regexp_internal *ri = RXi_GET(r) #define RXi_GET_DECL_NULL(r,ri) regexp_internal *ri = (r) ? RXi_GET(r) : NULL @@ -92,12 +92,12 @@ typedef struct regexp_internal { #define RXp_INTFLAGS(rx) ((rx)->intflags) #define RX_INTFLAGS(prog) RXp_INTFLAGS(ReANY(prog)) -#define PREGf_SKIP 0x00000001 -#define PREGf_IMPLICIT 0x00000002 /* Converted .* to ^.* */ -#define PREGf_NAUGHTY 0x00000004 /* how exponential is this pattern? */ -#define PREGf_VERBARG_SEEN 0x00000008 -#define PREGf_CUTGROUP_SEEN 0x00000010 -#define PREGf_USE_RE_EVAL 0x00000020 /* compiled with "use re 'eval'" */ +#define PREGf_SKIP 0x00000001 +#define PREGf_IMPLICIT 0x00000002 /* Converted .* to ^.* */ +#define PREGf_NAUGHTY 0x00000004 /* how exponential is this pattern? */ +#define PREGf_VERBARG_SEEN 0x00000008 +#define PREGf_CUTGROUP_SEEN 0x00000010 +#define PREGf_USE_RE_EVAL 0x00000020 /* compiled with "use re 'eval'" */ /* these used to be extflags, but are now intflags */ #define PREGf_NOSCAN 0x00000040 /* spare */ @@ -154,14 +154,14 @@ typedef struct regexp_internal { */ struct regnode_string { - U8 str_len; + U8 str_len; U8 type; U16 next_off; char string[1]; }; struct regnode_lstring { /* Constructed this way to keep the string aligned. */ - U8 flags; + U8 flags; U8 type; U16 next_off; U32 str_len; /* Only 18 bits allowed before would overflow 'next_off' */ @@ -169,17 +169,17 @@ struct regnode_lstring { /* Constructed this way to keep the string aligned. */ }; struct regnode_anyofhs { /* Constructed this way to keep the string aligned. */ - U8 str_len; + U8 str_len; U8 type; U16 next_off; U32 arg1; /* set by set_ANYOF_arg() */ char string[1]; }; -/* Argument bearing node - workhorse, +/* Argument bearing node - workhorse, arg1 is often for the data field */ struct regnode_1 { - U8 flags; + U8 flags; U8 type; U16 next_off; U32 arg1; @@ -201,7 +201,7 @@ struct regnode_1 { * then use inline functions to copy the data in or out. * */ struct regnode_p { - U8 flags; + U8 flags; U8 type; U16 next_off; char arg1_sv_ptr_bytes[sizeof(SV *)]; @@ -209,7 +209,7 @@ struct regnode_p { /* Similar to a regnode_1 but with an extra signed argument */ struct regnode_2L { - U8 flags; + U8 flags; U8 type; U16 next_off; U32 arg1; @@ -218,7 +218,7 @@ struct regnode_2L { /* 'Two field' -- Two 16 bit unsigned args */ struct regnode_2 { - U8 flags; + U8 flags; U8 type; U16 next_off; U16 arg1; @@ -233,13 +233,13 @@ struct regnode_2 { * The array is a bitmap capable of representing any possible continuation * byte. */ struct regnode_bbm { - U8 first_byte; + U8 first_byte; U8 type; U16 next_off; U8 bitmap[REGNODE_BBM_BITMAP_LEN]; }; -#define ANYOF_BITMAP_SIZE (NUM_ANYOF_CODE_POINTS / CHARBITS) +#define ANYOF_BITMAP_SIZE (NUM_ANYOF_CODE_POINTS / CHARBITS) /* Note that these form structs which are supersets of the next smaller one, by * appending fields. Alignment problems can occur if one of those optional @@ -254,21 +254,21 @@ struct regnode_bbm { /* also used by trie */ struct regnode_charclass { - U8 flags; + U8 flags; U8 type; U16 next_off; U32 arg1; /* set by set_ANYOF_arg() */ - char bitmap[ANYOF_BITMAP_SIZE]; /* only compile-time */ + char bitmap[ANYOF_BITMAP_SIZE]; /* only compile-time */ }; /* has runtime (locale) \d, \w, ..., [:posix:] classes */ struct regnode_charclass_posixl { - U8 flags; /* ANYOF_MATCHES_POSIXL bit must go here */ + U8 flags; /* ANYOF_MATCHES_POSIXL bit must go here */ U8 type; U16 next_off; U32 arg1; - char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time ... */ - U32 classflags; /* and run-time */ + char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time ... */ + U32 classflags; /* and run-time */ }; /* A synthetic start class (SSC); is a regnode_charclass_posixl_fold, plus an @@ -285,12 +285,12 @@ struct regnode_charclass_posixl { * never a next node. */ struct regnode_ssc { - U8 flags; /* ANYOF_MATCHES_POSIXL bit must go here */ + U8 flags; /* ANYOF_MATCHES_POSIXL bit must go here */ U8 type; U16 next_off; U32 arg1; - char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time ... */ - U32 classflags; /* ... and run-time */ + char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time ... */ + U32 classflags; /* ... and run-time */ /* Auxiliary, only used during construction; NULL afterwards: list of code * points matched */ @@ -376,15 +376,15 @@ struct regnode_ssc { #undef OPERAND #undef STRING -#define OP(p) ((p)->type) -#define FLAGS(p) ((p)->flags) /* Caution: Doesn't apply to all \ +#define OP(p) ((p)->type) +#define FLAGS(p) ((p)->flags) /* Caution: Doesn't apply to all \ regnode types. For some, it's the \ character set of the regnode */ -#define STR_LENs(p) (__ASSERT_(OP(p) != LEXACT && OP(p) != LEXACT_REQ8) \ +#define STR_LENs(p) (__ASSERT_(OP(p) != LEXACT && OP(p) != LEXACT_REQ8) \ ((struct regnode_string *)p)->str_len) -#define STRINGs(p) (__ASSERT_(OP(p) != LEXACT && OP(p) != LEXACT_REQ8) \ +#define STRINGs(p) (__ASSERT_(OP(p) != LEXACT && OP(p) != LEXACT_REQ8) \ ((struct regnode_string *)p)->string) -#define OPERANDs(p) STRINGs(p) +#define OPERANDs(p) STRINGs(p) #define PARNO(p) ARG(p) /* APPLIES for OPEN and CLOSE only */ @@ -400,21 +400,21 @@ struct regnode_ssc { * node to be an ARG2L, using the second 32 bit field for the length, and not * using the flags nor next_off fields at all. One could have an llstring node * and even an lllstring type. */ -#define STR_LENl(p) (__ASSERT_(OP(p) == LEXACT || OP(p) == LEXACT_REQ8) \ +#define STR_LENl(p) (__ASSERT_(OP(p) == LEXACT || OP(p) == LEXACT_REQ8) \ (((struct regnode_lstring *)p)->str_len)) -#define STRINGl(p) (__ASSERT_(OP(p) == LEXACT || OP(p) == LEXACT_REQ8) \ +#define STRINGl(p) (__ASSERT_(OP(p) == LEXACT || OP(p) == LEXACT_REQ8) \ (((struct regnode_lstring *)p)->string)) -#define OPERANDl(p) STRINGl(p) +#define OPERANDl(p) STRINGl(p) -#define STR_LEN(p) ((OP(p) == LEXACT || OP(p) == LEXACT_REQ8) \ +#define STR_LEN(p) ((OP(p) == LEXACT || OP(p) == LEXACT_REQ8) \ ? STR_LENl(p) : STR_LENs(p)) -#define STRING(p) ((OP(p) == LEXACT || OP(p) == LEXACT_REQ8) \ +#define STRING(p) ((OP(p) == LEXACT || OP(p) == LEXACT_REQ8) \ ? STRINGl(p) : STRINGs(p)) -#define OPERAND(p) STRING(p) +#define OPERAND(p) STRING(p) /* The number of (smallest) regnode equivalents that a string of length l bytes * occupies - Used by the REGNODE_AFTER() macros and functions. */ -#define STR_SZ(l) (((l) + sizeof(regnode) - 1) / sizeof(regnode)) +#define STR_SZ(l) (((l) + sizeof(regnode) - 1) / sizeof(regnode)) #define setSTR_LEN(p,v) \ STMT_START{ \ @@ -431,18 +431,18 @@ struct regnode_ssc { #undef NODE_ALIGN #undef ARG_LOC -#define NODE_ALIGN(node) -#define ARG_LOC(p) (((struct regnode_1 *)p)->arg1) +#define NODE_ALIGN(node) +#define ARG_LOC(p) (((struct regnode_1 *)p)->arg1) #define ARGp_BYTES_LOC(p) (((struct regnode_p *)p)->arg1_sv_ptr_bytes) -#define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1) -#define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2) -#define ARG2L_LOC(p) (((struct regnode_2L *)p)->arg2) +#define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1) +#define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2) +#define ARG2L_LOC(p) (((struct regnode_2L *)p)->arg2) /* These should no longer be used directly in most cases. Please use * the REGNODE_AFTER() macros instead. */ -#define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */ -#define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2) +#define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */ +#define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2) /* Core macros for computing "the regnode after this one". See also * Perl_regnode_after() in reginline.h @@ -735,7 +735,7 @@ ARGp_SET_inline(struct regnode *node, SV *ptr) { /* If this is set, the result of the match should be complemented. regexec.c * is expecting this to be in the low bit. Never in an SSC */ -#define ANYOF_INVERT 0x01 +#define ANYOF_INVERT 0x01 /* For the SSC node only, which cannot be inverted, so is shared with that bit. * This is used only during regex compilation. */ @@ -799,7 +799,7 @@ ARGp_SET_inline(struct regnode *node, SV *ptr) { #define ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared 0x80 #define ANYOF_WARN_SUPER__shared 0x80 -#define ANYOF_FLAGS_ALL ((U8) ~(0x10|0x20)) +#define ANYOF_FLAGS_ALL ((U8) ~(0x10|0x20)) #define ANYOF_LOCALE_FLAGS ( ANYOFL_FOLD \ | ANYOF_MATCHES_POSIXL \ @@ -868,44 +868,44 @@ ARGp_SET_inline(struct regnode *node, SV *ptr) { # error Problem with handy.h CC_foo_ #defines #endif -#define ANYOF_HORIZWS ((ANYOF_POSIXL_MAX)+2) /* = (ANYOF_NVERTWS + 1) */ -#define ANYOF_NHORIZWS ((ANYOF_POSIXL_MAX)+3) +#define ANYOF_HORIZWS ((ANYOF_POSIXL_MAX)+2) /* = (ANYOF_NVERTWS + 1) */ +#define ANYOF_NHORIZWS ((ANYOF_POSIXL_MAX)+3) #define ANYOF_UNIPROP ((ANYOF_POSIXL_MAX)+4) /* Used to indicate a Unicode property: \p{} or \P{} */ /* Backward source code compatibility. */ -#define ANYOF_ALNUML ANYOF_ALNUM -#define ANYOF_NALNUML ANYOF_NALNUM -#define ANYOF_SPACEL ANYOF_SPACE -#define ANYOF_NSPACEL ANYOF_NSPACE +#define ANYOF_ALNUML ANYOF_ALNUM +#define ANYOF_NALNUML ANYOF_NALNUM +#define ANYOF_SPACEL ANYOF_SPACE +#define ANYOF_NSPACEL ANYOF_NSPACE #define ANYOF_ALNUM ANYOF_WORDCHAR #define ANYOF_NALNUM ANYOF_NWORDCHAR /* Utility macros for the bitmap and classes of ANYOF */ -#define BITMAP_BYTE(p, c) (( (U8*) (p)) [ ( ( (UV) (c)) >> 3) ] ) -#define BITMAP_BIT(c) (1U << ((c) & 7)) -#define BITMAP_TEST(p, c) (BITMAP_BYTE(p, c) & BITMAP_BIT((U8)(c))) +#define BITMAP_BYTE(p, c) (( (U8*) (p)) [ ( ( (UV) (c)) >> 3) ] ) +#define BITMAP_BIT(c) (1U << ((c) & 7)) +#define BITMAP_TEST(p, c) (BITMAP_BYTE(p, c) & BITMAP_BIT((U8)(c))) -#define ANYOF_FLAGS(p) ((p)->flags) +#define ANYOF_FLAGS(p) ((p)->flags) -#define ANYOF_BIT(c) BITMAP_BIT(c) +#define ANYOF_BIT(c) BITMAP_BIT(c) #define ANYOF_POSIXL_BITMAP(p) (((regnode_charclass_posixl*) (p))->classflags) -#define POSIXL_SET(field, c) ((field) |= (1U << (c))) -#define ANYOF_POSIXL_SET(p, c) POSIXL_SET(ANYOF_POSIXL_BITMAP(p), (c)) +#define POSIXL_SET(field, c) ((field) |= (1U << (c))) +#define ANYOF_POSIXL_SET(p, c) POSIXL_SET(ANYOF_POSIXL_BITMAP(p), (c)) #define POSIXL_CLEAR(field, c) ((field) &= ~ (1U <<(c))) #define ANYOF_POSIXL_CLEAR(p, c) POSIXL_CLEAR(ANYOF_POSIXL_BITMAP(p), (c)) -#define POSIXL_TEST(field, c) ((field) & (1U << (c))) -#define ANYOF_POSIXL_TEST(p, c) POSIXL_TEST(ANYOF_POSIXL_BITMAP(p), (c)) +#define POSIXL_TEST(field, c) ((field) & (1U << (c))) +#define ANYOF_POSIXL_TEST(p, c) POSIXL_TEST(ANYOF_POSIXL_BITMAP(p), (c)) -#define POSIXL_ZERO(field) STMT_START { (field) = 0; } STMT_END -#define ANYOF_POSIXL_ZERO(ret) POSIXL_ZERO(ANYOF_POSIXL_BITMAP(ret)) +#define POSIXL_ZERO(field) STMT_START { (field) = 0; } STMT_END +#define ANYOF_POSIXL_ZERO(ret) POSIXL_ZERO(ANYOF_POSIXL_BITMAP(ret)) #define ANYOF_POSIXL_SET_TO_BITMAP(p, bits) \ STMT_START { ANYOF_POSIXL_BITMAP(p) = (bits); } STMT_END @@ -938,25 +938,25 @@ ARGp_SET_inline(struct regnode *node, SV *ptr) { #define ANYOF_POSIXL_AND(source, dest) STMT_START { (dest)->classflags &= (source)->classflags ; } STMT_END -#define ANYOF_BITMAP_ZERO(ret) Zero(((regnode_charclass*)(ret))->bitmap, ANYOF_BITMAP_SIZE, char) -#define ANYOF_BITMAP(p) ((regnode_charclass*)(p))->bitmap -#define ANYOF_BITMAP_BYTE(p, c) BITMAP_BYTE(ANYOF_BITMAP(p), c) -#define ANYOF_BITMAP_SET(p, c) (ANYOF_BITMAP_BYTE(p, c) |= ANYOF_BIT(c)) -#define ANYOF_BITMAP_CLEAR(p,c) (ANYOF_BITMAP_BYTE(p, c) &= ~ANYOF_BIT(c)) -#define ANYOF_BITMAP_TEST(p, c) cBOOL(ANYOF_BITMAP_BYTE(p, c) & ANYOF_BIT(c)) +#define ANYOF_BITMAP_ZERO(ret) Zero(((regnode_charclass*)(ret))->bitmap, ANYOF_BITMAP_SIZE, char) +#define ANYOF_BITMAP(p) ((regnode_charclass*)(p))->bitmap +#define ANYOF_BITMAP_BYTE(p, c) BITMAP_BYTE(ANYOF_BITMAP(p), c) +#define ANYOF_BITMAP_SET(p, c) (ANYOF_BITMAP_BYTE(p, c) |= ANYOF_BIT(c)) +#define ANYOF_BITMAP_CLEAR(p,c) (ANYOF_BITMAP_BYTE(p, c) &= ~ANYOF_BIT(c)) +#define ANYOF_BITMAP_TEST(p, c) cBOOL(ANYOF_BITMAP_BYTE(p, c) & ANYOF_BIT(c)) -#define ANYOF_BITMAP_SETALL(p) \ +#define ANYOF_BITMAP_SETALL(p) \ memset (ANYOF_BITMAP(p), 255, ANYOF_BITMAP_SIZE) -#define ANYOF_BITMAP_CLEARALL(p) \ +#define ANYOF_BITMAP_CLEARALL(p) \ Zero (ANYOF_BITMAP(p), ANYOF_BITMAP_SIZE) /* * Utility definitions. */ #ifndef CHARMASK -# define UCHARAT(p) ((int)*(const U8*)(p)) +# define UCHARAT(p) ((int)*(const U8*)(p)) #else -# define UCHARAT(p) ((int)*(p)&CHARMASK) +# define UCHARAT(p) ((int)*(p)&CHARMASK) #endif /* Number of regnode equivalents that 'guy' occupies beyond the size of the @@ -991,11 +991,11 @@ START_EXTERN_C #ifndef DOINIT EXTCONST regexp_engine PL_core_reg_engine; #else /* DOINIT */ -EXTCONST regexp_engine PL_core_reg_engine = { +EXTCONST regexp_engine PL_core_reg_engine = { Perl_re_compile, Perl_regexec_flags, Perl_re_intuit_start, - Perl_re_intuit_string, + Perl_re_intuit_string, Perl_regfree_internal, Perl_reg_numbered_buff_fetch, Perl_reg_numbered_buff_store, @@ -1003,9 +1003,9 @@ EXTCONST regexp_engine PL_core_reg_engine = { Perl_reg_named_buff, Perl_reg_named_buff_iter, Perl_reg_qr_package, -#if defined(USE_ITHREADS) +#if defined(USE_ITHREADS) Perl_regdupe_internal, -#endif +#endif Perl_re_op_compile }; #endif /* DOINIT */ @@ -1058,10 +1058,10 @@ struct reg_data { #define check_offset_max substrs->data[2].max_offset #define check_end_shift substrs->data[2].end_shift -#define RX_ANCHORED_SUBSTR(rx) (ReANY(rx)->anchored_substr) -#define RX_ANCHORED_UTF8(rx) (ReANY(rx)->anchored_utf8) -#define RX_FLOAT_SUBSTR(rx) (ReANY(rx)->float_substr) -#define RX_FLOAT_UTF8(rx) (ReANY(rx)->float_utf8) +#define RX_ANCHORED_SUBSTR(rx) (ReANY(rx)->anchored_substr) +#define RX_ANCHORED_UTF8(rx) (ReANY(rx)->anchored_utf8) +#define RX_FLOAT_SUBSTR(rx) (ReANY(rx)->float_substr) +#define RX_FLOAT_UTF8(rx) (ReANY(rx)->float_utf8) /* trie related stuff */ @@ -1099,12 +1099,12 @@ struct _reg_trie_state { /* info per word; indexed by wordnum */ typedef struct { - U16 prev; /* previous word in acceptance chain; eg in + U16 prev; /* previous word in acceptance chain; eg in * zzz|abc|ab/ after matching the chars abc, the * accepted word is #2, and the previous accepted * word is #3 */ - U32 len; /* how many chars long is this word? */ - U32 accept; /* accept state for this word */ + U32 len; /* how many chars long is this word? */ + U32 accept; /* accept state for this word */ } reg_trie_wordinfo; @@ -1123,7 +1123,7 @@ struct _reg_trie_data { reg_trie_state *states; /* state data */ reg_trie_trans *trans; /* array of transition elements */ char *bitmap; /* stclass bitmap */ - U16 *jump; /* optional 1 indexed array of offsets before tail + U16 *jump; /* optional 1 indexed array of offsets before tail for the node following a given word. */ reg_trie_wordinfo *wordinfo; /* array of info per word */ U16 uniquecharcount; /* unique chars in trie (width of trans table) */ @@ -1131,7 +1131,7 @@ struct _reg_trie_data { STRLEN minlen; /* minimum length of words in trie - build/opt only? */ STRLEN maxlen; /* maximum length of words in trie - build/opt only? */ U32 prefixlen; /* #chars in common prefix */ - U32 statecount; /* Build only - number of states in the states array + U32 statecount; /* Build only - number of states in the states array (including the unused zero state) */ U32 wordcount; /* Build only */ #ifdef DEBUGGING @@ -1166,11 +1166,11 @@ typedef struct _reg_ac_data reg_ac_data; This is simpler than refactoring all of it as wed end up with three different sets... */ -#define TRIE_BITMAP(p) (((reg_trie_data *)(p))->bitmap) -#define TRIE_BITMAP_BYTE(p, c) BITMAP_BYTE(TRIE_BITMAP(p), c) -#define TRIE_BITMAP_SET(p, c) (TRIE_BITMAP_BYTE(p, c) |= ANYOF_BIT((U8)c)) -#define TRIE_BITMAP_CLEAR(p,c) (TRIE_BITMAP_BYTE(p, c) &= ~ANYOF_BIT((U8)c)) -#define TRIE_BITMAP_TEST(p, c) (TRIE_BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c)) +#define TRIE_BITMAP(p) (((reg_trie_data *)(p))->bitmap) +#define TRIE_BITMAP_BYTE(p, c) BITMAP_BYTE(TRIE_BITMAP(p), c) +#define TRIE_BITMAP_SET(p, c) (TRIE_BITMAP_BYTE(p, c) |= ANYOF_BIT((U8)c)) +#define TRIE_BITMAP_CLEAR(p,c) (TRIE_BITMAP_BYTE(p, c) &= ~ANYOF_BIT((U8)c)) +#define TRIE_BITMAP_TEST(p, c) (TRIE_BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c)) #define IS_ANYOF_TRIE(op) ((op)==TRIEC || (op)==AHOCORASICKC) #define IS_TRIE_AC(op) ((op)>=AHOCORASICK) @@ -1201,7 +1201,7 @@ The three groups are: Compile, Execute, Extra. There is room for a further group, as currently only the low three bytes are used. Compile Options: - + PARSE PEEP TRIE @@ -1363,7 +1363,7 @@ re.pm, especially to the documentation. #define RE_SV_DUMPLEN(ItEm) (SvCUR(ItEm) - (SvTAIL(ItEm)!=0)) #define RE_SV_TAIL(ItEm) (SvTAIL(ItEm) ? "$" : "") - + #else /* if not DEBUGGING */ #define DECLARE_AND_GET_RE_DEBUG_FLAGS dNOOP @@ -1423,7 +1423,7 @@ typedef enum { #define REGNODE_ARG_LEN_VARIES(node) (PL_regnode_info[(node)].arg_len_varies) #define REGNODE_NAME(node) (PL_regnode_name[(node)]) -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) +#if defined(PERL_IN_REGEX_ENGINE) #include "reginline.h" #endif diff --git a/regcomp.sym b/regcomp.sym index a552cc3ea1..ddc8397daf 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -1,6 +1,6 @@ # regcomp.sym # -# File has two sections, divided by a line of dashes '-'. +# File has two sections, divided by a line of dashes '-'. # # Lines beginning with # are ignored, except for those that start with #* # which are included in pod/perldebguts.pod. # within a line may be part @@ -10,7 +10,7 @@ # # Note that the order in this file is important. # -# Format for first section: +# Format for first section: # NAME \s+ TYPE, arg-description [struct regnode suffix] [flags] [longjump] ; DESCRIPTION # arg-description is currently unused # suffix is appended to 'struct_regnode_' giving which one to use. If empty, @@ -278,8 +278,8 @@ GOSUB GOSUB, num/ofs 2L ; recurse to paren arg1 at (signed) ofs ar #*Special conditionals GROUPPN GROUPPN, no-sv 1 ; Whether the group matched. -INSUBP INSUBP, num 1 ; Whether we are in a specific recurse. -DEFINEP DEFINEP, none 1 ; Never execute directly. +INSUBP INSUBP, num 1 ; Whether we are in a specific recurse. +DEFINEP DEFINEP, none 1 ; Never execute directly. #*Backtracking Verbs ENDLIKE ENDLIKE, none ; Used only for the type field of verbs @@ -288,7 +288,7 @@ ACCEPT ENDLIKE, no-sv/num 2L ; Accepts the current matched string, wit #*Verbs With Arguments VERB VERB, no-sv 1 ; Used only for the type field of verbs -PRUNE VERB, no-sv 1 ; Pattern fails at this startpoint if no-backtracking through this +PRUNE VERB, no-sv 1 ; Pattern fails at this startpoint if no-backtracking through this MARKPOINT VERB, no-sv 1 ; Push the current location for rollback by cut. SKIP VERB, no-sv 1 ; On failure skip forward (to the mark) before retrying COMMIT VERB, no-sv 1 ; Pattern fails outright if backtracking through this diff --git a/regcomp_debug.c b/regcomp_debug.c new file mode 100644 index 0000000000..c7c42f6941 --- /dev/null +++ b/regcomp_debug.c @@ -0,0 +1,1625 @@ +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + +#include "EXTERN.h" +#define PERL_IN_REGEX_ENGINE +#define PERL_IN_REGCOMP_ANY +#define PERL_IN_REGCOMP_DEBUG_C +#include "perl.h" + +#ifdef PERL_IN_XSUB_RE +# include "re_comp.h" +#else +# include "regcomp.h" +#endif + +#include "invlist_inline.h" +#include "unicode_constants.h" +#include "regcomp_internal.h" + +#ifdef DEBUGGING + +int +Perl_re_printf(pTHX_ const char *fmt, ...) +{ + va_list ap; + int result; + PerlIO *f= Perl_debug_log; + PERL_ARGS_ASSERT_RE_PRINTF; + va_start(ap, fmt); + result = PerlIO_vprintf(f, fmt, ap); + va_end(ap); + return result; +} + +int +Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...) +{ + va_list ap; + int result; + PerlIO *f= Perl_debug_log; + PERL_ARGS_ASSERT_RE_INDENTF; + va_start(ap, depth); + PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, ""); + result = PerlIO_vprintf(f, fmt, ap); + va_end(ap); + return result; +} + +void +Perl_debug_show_study_flags(pTHX_ U32 flags, const char *open_str, + const char *close_str) +{ + PERL_ARGS_ASSERT_DEBUG_SHOW_STUDY_FLAGS; + if (!flags) + return; + + Perl_re_printf( aTHX_ "%s", open_str); + DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL); + DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL); + DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF); + DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR); + DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR); + DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL); + DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR); + DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND); + DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR); + DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS); + DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS); + DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY); + DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT); + DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY); + DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE); + Perl_re_printf( aTHX_ "%s", close_str); +} + +void +Perl_debug_studydata(pTHX_ const char *where, scan_data_t *data, + U32 depth, int is_inf, + SSize_t min, SSize_t stopmin, SSize_t delta) +{ + PERL_ARGS_ASSERT_DEBUG_STUDYDATA; + DECLARE_AND_GET_RE_DEBUG_FLAGS; + + DEBUG_OPTIMISE_MORE_r({ + if (!data) + return; + Perl_re_indentf(aTHX_ "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf, + depth, + where, + min, stopmin, delta, + (IV)data->pos_min, + (IV)data->pos_delta, + (UV)data->flags + ); + + Perl_debug_show_study_flags(aTHX_ data->flags," [","]"); + + Perl_re_printf( aTHX_ + " Whilem_c: %" IVdf " Lcp: %" IVdf " %s", + (IV)data->whilem_c, + (IV)(data->last_closep ? *((data)->last_closep) : -1), + is_inf ? "INF " : "" + ); + + if (data->last_found) { + int i; + Perl_re_printf(aTHX_ + "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf, + SvPVX_const(data->last_found), + (IV)data->last_end, + (IV)data->last_start_min, + (IV)data->last_start_max + ); + + for (i = 0; i < 2; i++) { + Perl_re_printf(aTHX_ + " %s%s: '%s' @ %" IVdf "/%" IVdf, + data->cur_is_floating == i ? "*" : "", + i ? "Float" : "Fixed", + SvPVX_const(data->substrs[i].str), + (IV)data->substrs[i].min_offset, + (IV)data->substrs[i].max_offset + ); + Perl_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]"); + } + } + + Perl_re_printf( aTHX_ "\n"); + }); +} + + +void +Perl_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state, + regnode *scan, U32 depth, U32 flags) +{ + PERL_ARGS_ASSERT_DEBUG_PEEP; + DECLARE_AND_GET_RE_DEBUG_FLAGS; + + DEBUG_OPTIMISE_r({ + regnode *Next; + + if (!scan) + return; + Next = regnext(scan); + regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); + Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)", + depth, + str, + REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv), + Next ? (REG_NODE_NUM(Next)) : 0 ); + Perl_debug_show_study_flags(aTHX_ flags," [ ","]"); + Perl_re_printf( aTHX_ "\n"); + }); +} + +#endif /* DEBUGGING */ + +/* + - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form + */ +#ifdef DEBUGGING + +static void +S_regdump_intflags(pTHX_ const char *lead, const U32 flags) +{ + int bit; + int set=0; + + ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8); + + for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) { + if (flags & (1<<bit)) { + if (!set++ && lead) + Perl_re_printf( aTHX_ "%s", lead); + Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]); + } + } + if (lead) { + if (set) + Perl_re_printf( aTHX_ "\n"); + else + Perl_re_printf( aTHX_ "%s[none-set]\n", lead); + } +} + +static void +S_regdump_extflags(pTHX_ const char *lead, const U32 flags) +{ + int bit; + int set=0; + regex_charset cs; + + ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8); + + for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) { + if (flags & (1U<<bit)) { + if ((1U<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */ + continue; + } + if (!set++ && lead) + Perl_re_printf( aTHX_ "%s", lead); + Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]); + } + } + if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) { + if (!set++ && lead) { + Perl_re_printf( aTHX_ "%s", lead); + } + switch (cs) { + case REGEX_UNICODE_CHARSET: + Perl_re_printf( aTHX_ "UNICODE"); + break; + case REGEX_LOCALE_CHARSET: + Perl_re_printf( aTHX_ "LOCALE"); + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + Perl_re_printf( aTHX_ "ASCII-RESTRICTED"); + break; + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED"); + break; + default: + Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET"); + break; + } + } + if (lead) { + if (set) + Perl_re_printf( aTHX_ "\n"); + else + Perl_re_printf( aTHX_ "%s[none-set]\n", lead); + } +} +#endif + +void +Perl_regdump(pTHX_ const regexp *r) +{ +#ifdef DEBUGGING + int i; + SV * const sv = sv_newmortal(); + SV *dsv= sv_newmortal(); + RXi_GET_DECL(r, ri); + DECLARE_AND_GET_RE_DEBUG_FLAGS; + + PERL_ARGS_ASSERT_REGDUMP; + + (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0); + + /* Header fields of interest. */ + for (i = 0; i < 2; i++) { + if (r->substrs->data[i].substr) { + RE_PV_QUOTED_DECL(s, 0, dsv, + SvPVX_const(r->substrs->data[i].substr), + RE_SV_DUMPLEN(r->substrs->data[i].substr), + PL_dump_re_max_len); + Perl_re_printf( aTHX_ + "%s %s%s at %" IVdf "..%" UVuf " ", + i ? "floating" : "anchored", + s, + RE_SV_TAIL(r->substrs->data[i].substr), + (IV)r->substrs->data[i].min_offset, + (UV)r->substrs->data[i].max_offset); + } + else if (r->substrs->data[i].utf8_substr) { + RE_PV_QUOTED_DECL(s, 1, dsv, + SvPVX_const(r->substrs->data[i].utf8_substr), + RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr), + 30); + Perl_re_printf( aTHX_ + "%s utf8 %s%s at %" IVdf "..%" UVuf " ", + i ? "floating" : "anchored", + s, + RE_SV_TAIL(r->substrs->data[i].utf8_substr), + (IV)r->substrs->data[i].min_offset, + (UV)r->substrs->data[i].max_offset); + } + } + + if (r->check_substr || r->check_utf8) + Perl_re_printf( aTHX_ + (const char *) + ( r->check_substr == r->substrs->data[1].substr + && r->check_utf8 == r->substrs->data[1].utf8_substr + ? "(checking floating" : "(checking anchored")); + if (r->intflags & PREGf_NOSCAN) + Perl_re_printf( aTHX_ " noscan"); + if (r->extflags & RXf_CHECK_ALL) + Perl_re_printf( aTHX_ " isall"); + if (r->check_substr || r->check_utf8) + Perl_re_printf( aTHX_ ") "); + + if (ri->regstclass) { + regprop(r, sv, ri->regstclass, NULL, NULL); + Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv)); + } + if (r->intflags & PREGf_ANCH) { + Perl_re_printf( aTHX_ "anchored"); + if (r->intflags & PREGf_ANCH_MBOL) + Perl_re_printf( aTHX_ "(MBOL)"); + if (r->intflags & PREGf_ANCH_SBOL) + Perl_re_printf( aTHX_ "(SBOL)"); + if (r->intflags & PREGf_ANCH_GPOS) + Perl_re_printf( aTHX_ "(GPOS)"); + Perl_re_printf( aTHX_ " "); + } + if (r->intflags & PREGf_GPOS_SEEN) + Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs); + if (r->intflags & PREGf_SKIP) + Perl_re_printf( aTHX_ "plus "); + if (r->intflags & PREGf_IMPLICIT) + Perl_re_printf( aTHX_ "implicit "); + Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen); + if (r->extflags & RXf_EVAL_SEEN) + Perl_re_printf( aTHX_ "with eval "); + Perl_re_printf( aTHX_ "\n"); + DEBUG_FLAGS_r({ + regdump_extflags("r->extflags: ", r->extflags); + regdump_intflags("r->intflags: ", r->intflags); + }); +#else + PERL_ARGS_ASSERT_REGDUMP; + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(r); +#endif /* DEBUGGING */ +} + +/* Should be synchronized with ANYOF_ #defines in regcomp.h */ +#ifdef DEBUGGING + +# if CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1 || CC_ALPHA_ != 2 \ + || CC_LOWER_ != 3 || CC_UPPER_ != 4 || CC_PUNCT_ != 5 \ + || CC_PRINT_ != 6 || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8 \ + || CC_CASED_ != 9 || CC_SPACE_ != 10 || CC_BLANK_ != 11 \ + || CC_XDIGIT_ != 12 || CC_CNTRL_ != 13 || CC_ASCII_ != 14 \ + || CC_VERTSPACE_ != 15 +# error Need to adjust order of anyofs[] +# endif +static const char * const anyofs[] = { + "\\w", + "\\W", + "\\d", + "\\D", + "[:alpha:]", + "[:^alpha:]", + "[:lower:]", + "[:^lower:]", + "[:upper:]", + "[:^upper:]", + "[:punct:]", + "[:^punct:]", + "[:print:]", + "[:^print:]", + "[:alnum:]", + "[:^alnum:]", + "[:graph:]", + "[:^graph:]", + "[:cased:]", + "[:^cased:]", + "\\s", + "\\S", + "[:blank:]", + "[:^blank:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:cntrl:]", + "[:^cntrl:]", + "[:ascii:]", + "[:^ascii:]", + "\\v", + "\\V" +}; +#endif + +/* +- regprop - printable representation of opcode, with run time support +*/ + +void +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state) +{ +#ifdef DEBUGGING + U8 k; + const U8 op = OP(o); + RXi_GET_DECL(prog, progi); + DECLARE_AND_GET_RE_DEBUG_FLAGS; + + PERL_ARGS_ASSERT_REGPROP; + + SvPVCLEAR(sv); + + if (op > REGNODE_MAX) { /* regnode.type is unsigned */ + if (pRExC_state) { /* This gives more info, if we have it */ + FAIL3("panic: corrupted regexp opcode %d > %d", + (int)op, (int)REGNODE_MAX); + } + else { + Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d", + (int)op, (int)REGNODE_MAX); + } + } + sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */ + + k = REGNODE_TYPE(op); + + if (k == EXACT) { + sv_catpvs(sv, " "); + /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) + * is a crude hack but it may be the best for now since + * we have no flag "this EXACTish node was UTF-8" + * --jhi */ + pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len, + PL_colors[0], PL_colors[1], + PERL_PV_ESCAPE_UNI_DETECT | + PERL_PV_ESCAPE_NONASCII | + PERL_PV_PRETTY_ELLIPSES | + PERL_PV_PRETTY_LTGT | + PERL_PV_PRETTY_NOCLEAR + ); + } else if (k == TRIE) { + /* print the details of the trie in dumpuntil instead, as + * progi->data isn't available here */ + const U32 n = ARG(o); + const reg_ac_data * const ac = IS_TRIE_AC(op) ? + (reg_ac_data *)progi->data->data[n] : + NULL; + const reg_trie_data * const trie + = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; + + Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(o->flags)); + DEBUG_TRIE_COMPILE_r({ + if (trie->jump) + sv_catpvs(sv, "(JUMP)"); + Perl_sv_catpvf(aTHX_ sv, + "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">", + (UV)trie->startstate, + (IV)trie->statecount-1, /* -1 because of the unused 0 element */ + (UV)trie->wordcount, + (UV)trie->minlen, + (UV)trie->maxlen, + (UV)TRIE_CHARCOUNT(trie), + (UV)trie->uniquecharcount + ); + }); + if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { + sv_catpvs(sv, "["); + (void) put_charclass_bitmap_innards(sv, + ((IS_ANYOF_TRIE(op)) + ? ANYOF_BITMAP(o) + : TRIE_BITMAP(trie)), + NULL, + NULL, + NULL, + 0, + FALSE + ); + sv_catpvs(sv, "]"); + } + } else if (k == CURLY) { + U32 lo = ARG1(o), hi = ARG2(o); + if (op == CURLYM || op == CURLYN || op == CURLYX) + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ + Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo); + if (hi == REG_INFTY) + sv_catpvs(sv, "INFTY"); + else + Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi); + sv_catpvs(sv, "}"); + } + else if (k == WHILEM && o->flags) /* Ordinal/of */ + Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); + else if (k == REF || k == OPEN || k == CLOSE + || k == GROUPP || op == ACCEPT) + { + AV *name_list= NULL; + U32 parno= (op == ACCEPT) ? (U32)ARG2L(o) : + (op == OPEN || op == CLOSE) ? (U32)PARNO(o) : + (U32)ARG(o); + Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */ + if ( RXp_PAREN_NAMES(prog) ) { + name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); + } else if ( pRExC_state ) { + name_list= RExC_paren_name_list; + } + if ( name_list ) { + if ( k != REF || (op < REFN)) { + SV **name= av_fetch_simple(name_list, parno, 0 ); + if (name) + Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); + } + else + if (parno > 0) { + /* parno must always be larger than 0 for this block + * as it represents a slot into the data array, which + * has the 0 slot reserved for a placeholder so any valid + * index into it is always true, eg non-zero + * see the '%' "what" type and the implementation of + * S_reg_add_data() + */ + SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]); + I32 *nums=(I32*)SvPVX(sv_dat); + SV **name= av_fetch_simple(name_list, nums[0], 0 ); + I32 n; + if (name) { + for ( n=0; n<SvIVX(sv_dat); n++ ) { + Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf, + (n ? "," : ""), (IV)nums[n]); + } + Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); + } + } + } + if ( k == REF && reginfo) { + U32 n = ARG(o); /* which paren pair */ + I32 ln = prog->offs[n].start; + if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1) + Perl_sv_catpvf(aTHX_ sv, ": FAIL"); + else if (ln == prog->offs[n].end) + Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); + else { + const char *s = reginfo->strbeg + ln; + Perl_sv_catpvf(aTHX_ sv, ": "); + Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0, + PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); + } + } + } else if (k == GOSUB) { + AV *name_list= NULL; + if ( RXp_PAREN_NAMES(prog) ) { + name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); + } else if ( pRExC_state ) { + name_list= RExC_paren_name_list; + } + + /* Paren and offset */ + Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o), + (int)((o + (int)ARG2L(o)) - progi->program) ); + if (name_list) { + SV **name= av_fetch_simple(name_list, ARG(o), 0 ); + if (name) + Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); + } + } + else if (k == LOGICAL) + /* 2: embedded, otherwise 1 */ + Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); + else if (k == ANYOF || k == ANYOFH || k == ANYOFR) { + U8 flags; + char * bitmap; + U8 do_sep = 0; /* Do we need to separate various components of the + output? */ + /* Set if there is still an unresolved user-defined property */ + SV *unresolved = NULL; + + /* Things that are ignored except when the runtime locale is UTF-8 */ + SV *only_utf8_locale_invlist = NULL; + + /* Code points that don't fit in the bitmap */ + SV *nonbitmap_invlist = NULL; + + /* And things that aren't in the bitmap, but are small enough to be */ + SV* bitmap_range_not_in_bitmap = NULL; + + bool inverted; + + if (k != ANYOF) { + flags = 0; + bitmap = NULL; + } + else { + flags = ANYOF_FLAGS(o); + bitmap = ANYOF_BITMAP(o); + } + + if (op == ANYOFL || op == ANYOFPOSIXL) { + if ((flags & ANYOFL_UTF8_LOCALE_REQD)) { + sv_catpvs(sv, "{utf8-locale-reqd}"); + } + if (flags & ANYOFL_FOLD) { + sv_catpvs(sv, "{i}"); + } + } + + inverted = flags & ANYOF_INVERT; + + /* If there is stuff outside the bitmap, get it */ + if (k == ANYOFR) { + + /* For a single range, split into the parts inside vs outside the + * bitmap. */ + UV start = ANYOFRbase(o); + UV end = ANYOFRbase(o) + ANYOFRdelta(o); + + if (start < NUM_ANYOF_CODE_POINTS) { + if (end < NUM_ANYOF_CODE_POINTS) { + bitmap_range_not_in_bitmap + = _add_range_to_invlist(bitmap_range_not_in_bitmap, + start, end); + } + else { + bitmap_range_not_in_bitmap + = _add_range_to_invlist(bitmap_range_not_in_bitmap, + start, NUM_ANYOF_CODE_POINTS); + start = NUM_ANYOF_CODE_POINTS; + } + } + + if (start >= NUM_ANYOF_CODE_POINTS) { + nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, + ANYOFRbase(o), + ANYOFRbase(o) + ANYOFRdelta(o)); + } + } + else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) { + nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, + NUM_ANYOF_CODE_POINTS, + UV_MAX); + } + else if (ANYOF_HAS_AUX(o)) { + (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE, + &unresolved, + &only_utf8_locale_invlist, + &nonbitmap_invlist); + + /* The aux data may contain stuff that could fit in the bitmap. + * This could come from a user-defined property being finally + * resolved when this call was done; or much more likely because + * there are matches that require UTF-8 to be valid, and so aren't + * in the bitmap (or ANYOFR). This is teased apart later */ + _invlist_intersection(nonbitmap_invlist, + PL_InBitmap, + &bitmap_range_not_in_bitmap); + /* Leave just the things that don't fit into the bitmap */ + _invlist_subtract(nonbitmap_invlist, + PL_InBitmap, + &nonbitmap_invlist); + } + + /* Ready to start outputting. First, the initial left bracket */ + Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); + + if ( bitmap + || bitmap_range_not_in_bitmap + || only_utf8_locale_invlist + || unresolved) + { + /* Then all the things that could fit in the bitmap */ + do_sep = put_charclass_bitmap_innards( + sv, + bitmap, + bitmap_range_not_in_bitmap, + only_utf8_locale_invlist, + o, + flags, + + /* Can't try inverting for a + * better display if there + * are things that haven't + * been resolved */ + (unresolved != NULL || k == ANYOFR)); + SvREFCNT_dec(bitmap_range_not_in_bitmap); + + /* If there are user-defined properties which haven't been defined + * yet, output them. If the result is not to be inverted, it is + * clearest to output them in a separate [] from the bitmap range + * stuff. If the result is to be complemented, we have to show + * everything in one [], as the inversion applies to the whole + * thing. Use {braces} to separate them from anything in the + * bitmap and anything above the bitmap. */ + if (unresolved) { + if (inverted) { + if (! do_sep) { /* If didn't output anything in the bitmap + */ + sv_catpvs(sv, "^"); + } + sv_catpvs(sv, "{"); + } + else if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], + PL_colors[0]); + } + sv_catsv(sv, unresolved); + if (inverted) { + sv_catpvs(sv, "}"); + } + do_sep = ! inverted; + } + else if ( do_sep == 2 + && ! nonbitmap_invlist + && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o)) + { + /* Here, the display shows the class as inverted, and + * everything above the lower display should also match, but + * there is no indication of that. Add this range so the code + * below will add it to the display */ + _invlist_union_complement_2nd(nonbitmap_invlist, + PL_InBitmap, + &nonbitmap_invlist); + } + } + + /* And, finally, add the above-the-bitmap stuff */ + if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) { + SV* contents; + + /* See if truncation size is overridden */ + const STRLEN dump_len = (PL_dump_re_max_len > 256) + ? PL_dump_re_max_len + : 256; + + /* This is output in a separate [] */ + if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]); + } + + /* And, for easy of understanding, it is shown in the + * uncomplemented form if possible. The one exception being if + * there are unresolved items, where the inversion has to be + * delayed until runtime */ + if (inverted && ! unresolved) { + _invlist_invert(nonbitmap_invlist); + _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist); + } + + contents = invlist_contents(nonbitmap_invlist, + FALSE /* output suitable for catsv */ + ); + + /* If the output is shorter than the permissible maximum, just do it. */ + if (SvCUR(contents) <= dump_len) { + sv_catsv(sv, contents); + } + else { + const char * contents_string = SvPVX(contents); + STRLEN i = dump_len; + + /* Otherwise, start at the permissible max and work back to the + * first break possibility */ + while (i > 0 && contents_string[i] != ' ') { + i--; + } + if (i == 0) { /* Fail-safe. Use the max if we couldn't + find a legal break */ + i = dump_len; + } + + sv_catpvn(sv, contents_string, i); + sv_catpvs(sv, "..."); + } + + SvREFCNT_dec_NN(contents); + SvREFCNT_dec_NN(nonbitmap_invlist); + } + + /* And finally the matching, closing ']' */ + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); + + if (op == ANYOFHs) { + Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1)); + } + else if (REGNODE_TYPE(op) != ANYOF) { + U8 lowest = (op != ANYOFHr) + ? FLAGS(o) + : LOWEST_ANYOF_HRx_BYTE(FLAGS(o)); + U8 highest = (op == ANYOFHr) + ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o)) + : (op == ANYOFH || op == ANYOFR) + ? 0xFF + : lowest; +#ifndef EBCDIC + if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o))) +#endif + { + Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest); + if (lowest != highest) { + Perl_sv_catpvf(aTHX_ sv, "-%02X", highest); + } + Perl_sv_catpvf(aTHX_ sv, ")"); + } + } + + SvREFCNT_dec(unresolved); + } + else if (k == ANYOFM) { + SV * cp_list = get_ANYOFM_contents(o); + + Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); + if (op == NANYOFM) { + _invlist_invert(cp_list); + } + + put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE); + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); + + SvREFCNT_dec(cp_list); + } + else if (k == ANYOFHbbm) { + SV * cp_list = get_ANYOFHbbm_contents(o); + Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); + + sv_catsv(sv, invlist_contents(cp_list, + FALSE /* output suitable for catsv */ + )); + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); + + SvREFCNT_dec(cp_list); + } + else if (k == POSIXD || k == NPOSIXD) { + U8 index = FLAGS(o) * 2; + if (index < C_ARRAY_LENGTH(anyofs)) { + if (*anyofs[index] != '[') { + sv_catpvs(sv, "["); + } + sv_catpv(sv, anyofs[index]); + if (*anyofs[index] != '[') { + sv_catpvs(sv, "]"); + } + } + else { + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + } + } + else if (k == BOUND || k == NBOUND) { + /* Must be synced with order of 'bound_type' in regcomp.h */ + const char * const bounds[] = { + "", /* Traditional */ + "{gcb}", + "{lb}", + "{sb}", + "{wb}" + }; + assert(FLAGS(o) < C_ARRAY_LENGTH(bounds)); + sv_catpv(sv, bounds[FLAGS(o)]); + } + else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) { + Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags)); + if (o->next_off) { + Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off); + } + Perl_sv_catpvf(aTHX_ sv, "]"); + } + else if (op == SBOL) + Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^"); + + /* add on the verb argument if there is one */ + if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && o->flags) { + if ( ARG(o) ) + Perl_sv_catpvf(aTHX_ sv, ":%" SVf, + SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); + else + sv_catpvs(sv, ":NULL"); + } +#else + PERL_UNUSED_CONTEXT; + PERL_UNUSED_ARG(sv); + PERL_UNUSED_ARG(o); + PERL_UNUSED_ARG(prog); + PERL_UNUSED_ARG(reginfo); + PERL_UNUSED_ARG(pRExC_state); +#endif /* DEBUGGING */ +} + +#ifdef DEBUGGING + +STATIC void +S_put_code_point(pTHX_ SV *sv, UV c) +{ + PERL_ARGS_ASSERT_PUT_CODE_POINT; + + if (c > 255) { + Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c); + } + else if (isPRINT(c)) { + const char string = (char) c; + + /* We use {phrase} as metanotation in the class, so also escape literal + * braces */ + if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}') + sv_catpvs(sv, "\\"); + sv_catpvn(sv, &string, 1); + } + else if (isMNEMONIC_CNTRL(c)) { + Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c)); + } + else { + Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c); + } +} + +STATIC void +S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) +{ + /* Appends to 'sv' a displayable version of the range of code points from + * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls + * that have them, when they occur at the beginning or end of the range. + * It uses hex to output the remaining code points, unless 'allow_literals' + * is true, in which case the printable ASCII ones are output as-is (though + * some of these will be escaped by put_code_point()). + * + * NOTE: This is designed only for printing ranges of code points that fit + * inside an ANYOF bitmap. Higher code points are simply suppressed + */ + + const unsigned int min_range_count = 3; + + assert(start <= end); + + PERL_ARGS_ASSERT_PUT_RANGE; + + while (start <= end) { + UV this_end; + const char * format; + + if ( end - start < min_range_count + && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end)))) + { + /* Output a range of 1 or 2 chars individually, or longer ranges + * when printable */ + for (; start <= end; start++) { + put_code_point(sv, start); + } + break; + } + + /* If permitted by the input options, and there is a possibility that + * this range contains a printable literal, look to see if there is + * one. */ + if (allow_literals && start <= MAX_PRINT_A) { + + /* If the character at the beginning of the range isn't an ASCII + * printable, effectively split the range into two parts: + * 1) the portion before the first such printable, + * 2) the rest + * and output them separately. */ + if (! isPRINT_A(start)) { + UV temp_end = start + 1; + + /* There is no point looking beyond the final possible + * printable, in MAX_PRINT_A */ + UV max = MIN(end, MAX_PRINT_A); + + while (temp_end <= max && ! isPRINT_A(temp_end)) { + temp_end++; + } + + /* Here, temp_end points to one beyond the first printable if + * found, or to one beyond 'max' if not. If none found, make + * sure that we use the entire range */ + if (temp_end > MAX_PRINT_A) { + temp_end = end + 1; + } + + /* Output the first part of the split range: the part that + * doesn't have printables, with the parameter set to not look + * for literals (otherwise we would infinitely recurse) */ + put_range(sv, start, temp_end - 1, FALSE); + + /* The 2nd part of the range (if any) starts here. */ + start = temp_end; + + /* We do a continue, instead of dropping down, because even if + * the 2nd part is non-empty, it could be so short that we want + * to output it as individual characters, as tested for at the + * top of this loop. */ + continue; + } + + /* Here, 'start' is a printable ASCII. If it is an alphanumeric, + * output a sub-range of just the digits or letters, then process + * the remaining portion as usual. */ + if (isALPHANUMERIC_A(start)) { + UV mask = (isDIGIT_A(start)) + ? CC_DIGIT_ + : isUPPER_A(start) + ? CC_UPPER_ + : CC_LOWER_; + UV temp_end = start + 1; + + /* Find the end of the sub-range that includes just the + * characters in the same class as the first character in it */ + while (temp_end <= end && generic_isCC_A_(temp_end, mask)) { + temp_end++; + } + temp_end--; + + /* For short ranges, don't duplicate the code above to output + * them; just call recursively */ + if (temp_end - start < min_range_count) { + put_range(sv, start, temp_end, FALSE); + } + else { /* Output as a range */ + put_code_point(sv, start); + sv_catpvs(sv, "-"); + put_code_point(sv, temp_end); + } + start = temp_end + 1; + continue; + } + + /* We output any other printables as individual characters */ + if (isPUNCT_A(start) || isSPACE_A(start)) { + while (start <= end && (isPUNCT_A(start) + || isSPACE_A(start))) + { + put_code_point(sv, start); + start++; + } + continue; + } + } /* End of looking for literals */ + + /* Here is not to output as a literal. Some control characters have + * mnemonic names. Split off any of those at the beginning and end of + * the range to print mnemonically. It isn't possible for many of + * these to be in a row, so this won't overwhelm with output */ + if ( start <= end + && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end))) + { + while (isMNEMONIC_CNTRL(start) && start <= end) { + put_code_point(sv, start); + start++; + } + + /* If this didn't take care of the whole range ... */ + if (start <= end) { + + /* Look backwards from the end to find the final non-mnemonic + * */ + UV temp_end = end; + while (isMNEMONIC_CNTRL(temp_end)) { + temp_end--; + } + + /* And separately output the interior range that doesn't start + * or end with mnemonics */ + put_range(sv, start, temp_end, FALSE); + + /* Then output the mnemonic trailing controls */ + start = temp_end + 1; + while (start <= end) { + put_code_point(sv, start); + start++; + } + break; + } + } + + /* As a final resort, output the range or subrange as hex. */ + + if (start >= NUM_ANYOF_CODE_POINTS) { + this_end = end; + } + else { /* Have to split range at the bitmap boundary */ + this_end = (end < NUM_ANYOF_CODE_POINTS) + ? end + : NUM_ANYOF_CODE_POINTS - 1; + } +#if NUM_ANYOF_CODE_POINTS > 256 + format = (this_end < 256) + ? "\\x%02" UVXf "-\\x%02" UVXf + : "\\x{%04" UVXf "}-\\x{%04" UVXf "}"; +#else + format = "\\x%02" UVXf "-\\x%02" UVXf; +#endif + GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); + Perl_sv_catpvf(aTHX_ sv, format, start, this_end); + GCC_DIAG_RESTORE_STMT; + break; + } +} + +STATIC void +S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist) +{ + /* Concatenate onto the PV in 'sv' a displayable form of the inversion list + * 'invlist' */ + + UV start, end; + bool allow_literals = TRUE; + + PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST; + + /* Generally, it is more readable if printable characters are output as + * literals, but if a range (nearly) spans all of them, it's best to output + * it as a single range. This code will use a single range if all but 2 + * ASCII printables are in it */ + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + + /* If the range starts beyond the final printable, it doesn't have any + * in it */ + if (start > MAX_PRINT_A) { + break; + } + + /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span + * all but two, the range must start and end no later than 2 from + * either end */ + if (start < ' ' + 2 && end > MAX_PRINT_A - 2) { + if (end > MAX_PRINT_A) { + end = MAX_PRINT_A; + } + if (start < ' ') { + start = ' '; + } + if (end - start >= MAX_PRINT_A - ' ' - 2) { + allow_literals = FALSE; + } + break; + } + } + invlist_iterfinish(invlist); + + /* Here we have figured things out. Output each range */ + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (start >= NUM_ANYOF_CODE_POINTS) { + break; + } + put_range(sv, start, end, allow_literals); + } + invlist_iterfinish(invlist); + + return; +} + +STATIC SV* +S_put_charclass_bitmap_innards_common(pTHX_ + SV* invlist, /* The bitmap */ + SV* posixes, /* Under /l, things like [:word:], \S */ + SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */ + SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */ + SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */ + const bool invert /* Is the result to be inverted? */ +) +{ + /* Create and return an SV containing a displayable version of the bitmap + * and associated information determined by the input parameters. If the + * output would have been only the inversion indicator '^', NULL is instead + * returned. */ + + SV * output; + + PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON; + + if (invert) { + output = newSVpvs("^"); + } + else { + output = newSVpvs(""); + } + + /* First, the code points in the bitmap that are unconditionally there */ + put_charclass_bitmap_innards_invlist(output, invlist); + + /* Traditionally, these have been placed after the main code points */ + if (posixes) { + sv_catsv(output, posixes); + } + + if (only_utf8 && _invlist_len(only_utf8)) { + Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]); + put_charclass_bitmap_innards_invlist(output, only_utf8); + } + + if (not_utf8 && _invlist_len(not_utf8)) { + Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]); + put_charclass_bitmap_innards_invlist(output, not_utf8); + } + + if (only_utf8_locale && _invlist_len(only_utf8_locale)) { + Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]); + put_charclass_bitmap_innards_invlist(output, only_utf8_locale); + + /* This is the only list in this routine that can legally contain code + * points outside the bitmap range. The call just above to + * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so + * output them here. There's about a half-dozen possible, and none in + * contiguous ranges longer than 2 */ + if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { + UV start, end; + SV* above_bitmap = NULL; + + _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap); + + invlist_iterinit(above_bitmap); + while (invlist_iternext(above_bitmap, &start, &end)) { + UV i; + + for (i = start; i <= end; i++) { + put_code_point(output, i); + } + } + invlist_iterfinish(above_bitmap); + SvREFCNT_dec_NN(above_bitmap); + } + } + + if (invert && SvCUR(output) == 1) { + return NULL; + } + + return output; +} + +STATIC U8 +S_put_charclass_bitmap_innards(pTHX_ SV *sv, + char *bitmap, + SV *nonbitmap_invlist, + SV *only_utf8_locale_invlist, + const regnode * const node, + const U8 flags, + const bool force_as_is_display) +{ + /* Appends to 'sv' a displayable version of the innards of the bracketed + * character class defined by the other arguments: + * 'bitmap' points to the bitmap, or NULL if to ignore that. + * 'nonbitmap_invlist' is an inversion list of the code points that are in + * the bitmap range, but for some reason aren't in the bitmap; NULL if + * none. The reasons for this could be that they require some + * condition such as the target string being or not being in UTF-8 + * (under /d), or because they came from a user-defined property that + * was not resolved at the time of the regex compilation (under /u) + * 'only_utf8_locale_invlist' is an inversion list of the code points that + * are valid only if the runtime locale is a UTF-8 one; NULL if none + * 'node' is the regex pattern ANYOF node. It is needed only when the + * above two parameters are not null, and is passed so that this + * routine can tease apart the various reasons for them. + * 'flags' is the flags field of 'node' + * 'force_as_is_display' is TRUE if this routine should definitely NOT try + * to invert things to see if that leads to a cleaner display. If + * FALSE, this routine is free to use its judgment about doing this. + * + * It returns 0 if nothing was actually output. (It may be that + * the bitmap, etc is empty.) + * 1 if the output wasn't inverted (didn't begin with a '^') + * 2 if the output was inverted (did begin with a '^') + * + * When called for outputting the bitmap of a non-ANYOF node, just pass the + * bitmap, with the succeeding parameters set to NULL, and the final one to + * FALSE. + */ + + /* In general, it tries to display the 'cleanest' representation of the + * innards, choosing whether to display them inverted or not, regardless of + * whether the class itself is to be inverted. However, there are some + * cases where it can't try inverting, as what actually matches isn't known + * until runtime, and hence the inversion isn't either. */ + + bool inverting_allowed = ! force_as_is_display; + + int i; + STRLEN orig_sv_cur = SvCUR(sv); + + SV* invlist; /* Inversion list we accumulate of code points that + are unconditionally matched */ + SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is + UTF-8 */ + SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8 + */ + SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */ + SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale + is UTF-8 */ + + SV* as_is_display; /* The output string when we take the inputs + literally */ + SV* inverted_display; /* The output string when we invert the inputs */ + + bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted + to match? */ + /* We are biased in favor of displaying things without them being inverted, + * as that is generally easier to understand */ + const int bias = 5; + + PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS; + + /* Start off with whatever code points are passed in. (We clone, so we + * don't change the caller's list) */ + if (nonbitmap_invlist) { + assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS); + invlist = invlist_clone(nonbitmap_invlist, NULL); + } + else { /* Worst case size is every other code point is matched */ + invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2); + } + + if (flags) { + if (OP(node) == ANYOFD) { + + /* This flag indicates that the code points below 0x100 in the + * nonbitmap list are precisely the ones that match only when the + * target is UTF-8 (they should all be non-ASCII). */ + if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) { + _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8); + _invlist_subtract(invlist, only_utf8, &invlist); + } + + /* And this flag for matching all non-ASCII 0xFF and below */ + if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) { + not_utf8 = invlist_clone(PL_UpperLatin1, NULL); + } + } + else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) { + + /* If either of these flags are set, what matches isn't + * determinable except during execution, so don't know enough here + * to invert */ + if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) { + inverting_allowed = FALSE; + } + + /* What the posix classes match also varies at runtime, so these + * will be output symbolically. */ + if (ANYOF_POSIXL_TEST_ANY_SET(node)) { + int i; + + posixes = newSVpvs(""); + for (i = 0; i < ANYOF_POSIXL_MAX; i++) { + if (ANYOF_POSIXL_TEST(node, i)) { + sv_catpv(posixes, anyofs[i]); + } + } + } + } + } + + /* Accumulate the bit map into the unconditional match list */ + if (bitmap) { + for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { + if (BITMAP_TEST(bitmap, i)) { + int start = i++; + for (; + i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); + i++) + { /* empty */ } + invlist = _add_range_to_invlist(invlist, start, i-1); + } + } + } + + /* Make sure that the conditional match lists don't have anything in them + * that match unconditionally; otherwise the output is quite confusing. + * This could happen if the code that populates these misses some + * duplication. */ + if (only_utf8) { + _invlist_subtract(only_utf8, invlist, &only_utf8); + } + if (not_utf8) { + _invlist_subtract(not_utf8, invlist, ¬_utf8); + } + + if (only_utf8_locale_invlist) { + + /* Since this list is passed in, we have to make a copy before + * modifying it */ + only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL); + + _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale); + + /* And, it can get really weird for us to try outputting an inverted + * form of this list when it has things above the bitmap, so don't even + * try */ + if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { + inverting_allowed = FALSE; + } + } + + /* Calculate what the output would be if we take the input as-is */ + as_is_display = put_charclass_bitmap_innards_common(invlist, + posixes, + only_utf8, + not_utf8, + only_utf8_locale, + invert); + + /* If have to take the output as-is, just do that */ + if (! inverting_allowed) { + if (as_is_display) { + sv_catsv(sv, as_is_display); + SvREFCNT_dec_NN(as_is_display); + } + } + else { /* But otherwise, create the output again on the inverted input, and + use whichever version is shorter */ + + int inverted_bias, as_is_bias; + + /* We will apply our bias to whichever of the results doesn't have + * the '^' */ + bool trial_invert; + if (invert) { + trial_invert = FALSE; + as_is_bias = bias; + inverted_bias = 0; + } + else { + trial_invert = TRUE; + as_is_bias = 0; + inverted_bias = bias; + } + + /* Now invert each of the lists that contribute to the output, + * excluding from the result things outside the possible range */ + + /* For the unconditional inversion list, we have to add in all the + * conditional code points, so that when inverted, they will be gone + * from it */ + _invlist_union(only_utf8, invlist, &invlist); + _invlist_union(not_utf8, invlist, &invlist); + _invlist_union(only_utf8_locale, invlist, &invlist); + _invlist_invert(invlist); + _invlist_intersection(invlist, PL_InBitmap, &invlist); + + if (only_utf8) { + _invlist_invert(only_utf8); + _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8); + } + else if (not_utf8) { + + /* If a code point matches iff the target string is not in UTF-8, + * then complementing the result has it not match iff not in UTF-8, + * which is the same thing as matching iff it is UTF-8. */ + only_utf8 = not_utf8; + not_utf8 = NULL; + } + + if (only_utf8_locale) { + _invlist_invert(only_utf8_locale); + _invlist_intersection(only_utf8_locale, + PL_InBitmap, + &only_utf8_locale); + } + + inverted_display = put_charclass_bitmap_innards_common( + invlist, + posixes, + only_utf8, + not_utf8, + only_utf8_locale, trial_invert); + + /* Use the shortest representation, taking into account our bias + * against showing it inverted */ + if ( inverted_display + && ( ! as_is_display + || ( SvCUR(inverted_display) + inverted_bias + < SvCUR(as_is_display) + as_is_bias))) + { + sv_catsv(sv, inverted_display); + invert = ! invert; + } + else if (as_is_display) { + sv_catsv(sv, as_is_display); + } + + SvREFCNT_dec(as_is_display); + SvREFCNT_dec(inverted_display); + } + + SvREFCNT_dec_NN(invlist); + SvREFCNT_dec(only_utf8); + SvREFCNT_dec(not_utf8); + SvREFCNT_dec(posixes); + SvREFCNT_dec(only_utf8_locale); + + U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur); + if (did_output_something) { + /* Distinguish between non and inverted cases */ + did_output_something += invert; + } + + return did_output_something; +} + + +const regnode * +Perl_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, + const regnode *last, const regnode *plast, + SV* sv, I32 indent, U32 depth) +{ + const regnode *next; + const regnode *optstart= NULL; + + RXi_GET_DECL(r, ri); + DECLARE_AND_GET_RE_DEBUG_FLAGS; + + PERL_ARGS_ASSERT_DUMPUNTIL; + +#ifdef DEBUG_DUMPUNTIL + Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start, + last ? last-start : 0, plast ? plast-start : 0); +#endif + + if (plast && plast < last) + last= plast; + + while (node && (!last || node < last)) { + const U8 op = OP(node); + + if (op == CLOSE || op == SRCLOSE || op == WHILEM) + indent--; + next = regnext((regnode *)node); + const regnode *after = regnode_after((regnode *)node,0); + + /* Where, what. */ + if (op == OPTIMIZED) { + if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) + optstart = node; + else + goto after_print; + } else + CLEAR_OPTSTART; + + regprop(r, sv, node, NULL, NULL); + Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start), + (int)(2*indent + 1), "", SvPVX_const(sv)); + + if (op != OPTIMIZED) { + if (next == NULL) /* Next ptr. */ + Perl_re_printf( aTHX_ " (0)"); + else if (REGNODE_TYPE(op) == BRANCH + && REGNODE_TYPE(OP(next)) != BRANCH ) + Perl_re_printf( aTHX_ " (FAIL)"); + else + Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start)); + Perl_re_printf( aTHX_ "\n"); + } + + after_print: + if (REGNODE_TYPE(op) == BRANCHJ) { + assert(next); + const regnode *nnode = (OP(next) == LONGJMP + ? regnext((regnode *)next) + : next); + if (last && nnode > last) + nnode = last; + DUMPUNTIL(after, nnode); + } + else if (REGNODE_TYPE(op) == BRANCH) { + assert(next); + DUMPUNTIL(after, next); + } + else if ( REGNODE_TYPE(op) == TRIE ) { + const regnode *this_trie = node; + const U32 n = ARG(node); + const reg_ac_data * const ac = op>=AHOCORASICK ? + (reg_ac_data *)ri->data->data[n] : + NULL; + const reg_trie_data * const trie = + (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie]; +#ifdef DEBUGGING + AV *const trie_words + = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); +#endif + const regnode *nextbranch= NULL; + I32 word_idx; + SvPVCLEAR(sv); + for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { + SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0); + + Perl_re_indentf( aTHX_ "%s ", + indent+3, + elem_ptr + ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), + SvCUR(*elem_ptr), PL_dump_re_max_len, + PL_colors[0], PL_colors[1], + (SvUTF8(*elem_ptr) + ? PERL_PV_ESCAPE_UNI + : 0) + | PERL_PV_PRETTY_ELLIPSES + | PERL_PV_PRETTY_LTGT + ) + : "???" + ); + if (trie->jump) { + U16 dist= trie->jump[word_idx+1]; + Perl_re_printf( aTHX_ "(%" UVuf ")\n", + (UV)((dist ? this_trie + dist : next) - start)); + if (dist) { + if (!nextbranch) + nextbranch= this_trie + trie->jump[0]; + DUMPUNTIL(this_trie + dist, nextbranch); + } + if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH) + nextbranch= regnext((regnode *)nextbranch); + } else { + Perl_re_printf( aTHX_ "\n"); + } + } + if (last && next > last) + node= last; + else + node= next; + } + else if ( op == CURLY ) { /* "next" might be very big: optimizer */ + DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */ + } + else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) { + assert(next); + DUMPUNTIL(after, next); + } + else if ( op == PLUS || op == STAR) { + DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */ + } + else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) { + /* Literal string, where present. */ + node = (const regnode *)REGNODE_AFTER_varies(node); + } + else { + node = REGNODE_AFTER_opcode(node,op); + } + if (op == CURLYX || op == OPEN || op == SROPEN) + indent++; + if (REGNODE_TYPE(op) == END) + break; + } + CLEAR_OPTSTART; +#ifdef DEBUG_DUMPUNTIL + Perl_re_printf( aTHX_ "--- %d\n", (int)indent); +#endif + return node; +} + +#endif /* DEBUGGING */ diff --git a/regcomp_internal.h b/regcomp_internal.h new file mode 100644 index 0000000000..f8682f823d --- /dev/null +++ b/regcomp_internal.h @@ -0,0 +1,1196 @@ +#ifndef REGCOMP_INTERNAL_H +#define REGCOMP_INTERNAL_H +#ifndef STATIC +#define STATIC static +#endif + +/* this is a chain of data about sub patterns we are processing that + need to be handled separately/specially in study_chunk. Its so + we can simulate recursion without losing state. */ +struct scan_frame; +typedef struct scan_frame { + regnode *last_regnode; /* last node to process in this frame */ + regnode *next_regnode; /* next node to process when last is reached */ + U32 prev_recursed_depth; + I32 stopparen; /* what stopparen do we use */ + bool in_gosub; /* this or an outer frame is for GOSUB */ + + struct scan_frame *this_prev_frame; /* this previous frame */ + struct scan_frame *prev_frame; /* previous frame */ + struct scan_frame *next_frame; /* next frame */ +} scan_frame; + +/* Certain characters are output as a sequence with the first being a + * backslash. */ +#define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c) + + +struct RExC_state_t { + U32 flags; /* RXf_* are we folding, multilining? */ + U32 pm_flags; /* PMf_* stuff from the calling PMOP */ + char *precomp; /* uncompiled string. */ + char *precomp_end; /* pointer to end of uncompiled string. */ + REGEXP *rx_sv; /* The SV that is the regexp. */ + regexp *rx; /* perl core regexp structure */ + regexp_internal *rxi; /* internal data for regexp object + pprivate field */ + char *start; /* Start of input for compile */ + char *end; /* End of input for compile */ + char *parse; /* Input-scan pointer. */ + char *copy_start; /* start of copy of input within + constructed parse string */ + char *save_copy_start; /* Provides one level of saving + and restoring 'copy_start' */ + char *copy_start_in_input; /* Position in input string + corresponding to copy_start */ + SSize_t whilem_seen; /* number of WHILEM in this expr */ + regnode *emit_start; /* Start of emitted-code area */ + regnode_offset emit; /* Code-emit pointer */ + I32 naughty; /* How bad is this pattern? */ + I32 sawback; /* Did we see \1, ...? */ + SSize_t size; /* Number of regnode equivalents in + pattern */ + Size_t sets_depth; /* Counts recursion depth of already- + compiled regex set patterns */ + U32 seen; + + I32 parens_buf_size; /* #slots malloced open/close_parens */ + regnode_offset *open_parens; /* offsets to open parens */ + regnode_offset *close_parens; /* offsets to close parens */ + HV *paren_names; /* Paren names */ + + /* position beyond 'precomp' of the warning message furthest away from + * 'precomp'. During the parse, no warnings are raised for any problems + * earlier in the parse than this position. This works if warnings are + * raised the first time a given spot is parsed, and if only one + * independent warning is raised for any given spot */ + Size_t latest_warn_offset; + + I32 npar; /* Capture buffer count so far in the + parse, (OPEN) plus one. ("par" 0 is + the whole pattern)*/ + I32 total_par; /* During initial parse, is either 0, + or -1; the latter indicating a + reparse is needed. After that pass, + it is what 'npar' became after the + pass. Hence, it being > 0 indicates + we are in a reparse situation */ + I32 nestroot; /* root parens we are in - used by + accept */ + I32 seen_zerolen; + regnode *end_op; /* END node in program */ + I32 utf8; /* whether the pattern is utf8 or not */ + I32 orig_utf8; /* whether the pattern was originally in utf8 */ + /* XXX use this for future optimisation of case + * where pattern must be upgraded to utf8. */ + I32 uni_semantics; /* If a d charset modifier should use unicode + rules, even if the pattern is not in + utf8 */ + + I32 recurse_count; /* Number of recurse regops we have generated */ + regnode **recurse; /* Recurse regops */ + U8 *study_chunk_recursed; /* bitmap of which subs we have moved + through */ + U32 study_chunk_recursed_bytes; /* bytes in bitmap */ + I32 in_lookaround; + I32 contains_locale; + I32 override_recoding; + I32 recode_x_to_native; + I32 in_multi_char_class; + int code_index; /* next code_blocks[] slot */ + struct reg_code_blocks *code_blocks;/* positions of literal (?{}) + within pattern */ + SSize_t maxlen; /* mininum possible number of chars in string to match */ + scan_frame *frame_head; + scan_frame *frame_last; + U32 frame_count; + AV *warn_text; + HV *unlexed_names; + SV *runtime_code_qr; /* qr with the runtime code blocks */ +#ifdef DEBUGGING + const char *lastparse; + I32 lastnum; + U32 study_chunk_recursed_count; + AV *paren_name_list; /* idx -> name */ + SV *mysv1; + SV *mysv2; + +#define RExC_lastparse (pRExC_state->lastparse) +#define RExC_lastnum (pRExC_state->lastnum) +#define RExC_paren_name_list (pRExC_state->paren_name_list) +#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count) +#define RExC_mysv (pRExC_state->mysv1) +#define RExC_mysv1 (pRExC_state->mysv1) +#define RExC_mysv2 (pRExC_state->mysv2) + +#endif + bool seen_d_op; + bool strict; + bool study_started; + bool in_script_run; + bool use_BRANCHJ; + bool sWARN_EXPERIMENTAL__VLB; + bool sWARN_EXPERIMENTAL__REGEX_SETS; +}; + +#define RExC_flags (pRExC_state->flags) +#define RExC_pm_flags (pRExC_state->pm_flags) +#define RExC_precomp (pRExC_state->precomp) +#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input) +#define RExC_copy_start_in_constructed (pRExC_state->copy_start) +#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start) +#define RExC_precomp_end (pRExC_state->precomp_end) +#define RExC_rx_sv (pRExC_state->rx_sv) +#define RExC_rx (pRExC_state->rx) +#define RExC_rxi (pRExC_state->rxi) +#define RExC_start (pRExC_state->start) +#define RExC_end (pRExC_state->end) +#define RExC_parse (pRExC_state->parse) +#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset ) +#define RExC_whilem_seen (pRExC_state->whilem_seen) +#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs + under /d from /u ? */ + +#define RExC_emit (pRExC_state->emit) +#define RExC_emit_start (pRExC_state->emit_start) +#define RExC_sawback (pRExC_state->sawback) +#define RExC_seen (pRExC_state->seen) +#define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) +#define RExC_npar (pRExC_state->npar) +#define RExC_total_parens (pRExC_state->total_par) +#define RExC_parens_buf_size (pRExC_state->parens_buf_size) +#define RExC_nestroot (pRExC_state->nestroot) +#define RExC_seen_zerolen (pRExC_state->seen_zerolen) +#define RExC_utf8 (pRExC_state->utf8) +#define RExC_uni_semantics (pRExC_state->uni_semantics) +#define RExC_orig_utf8 (pRExC_state->orig_utf8) +#define RExC_open_parens (pRExC_state->open_parens) +#define RExC_close_parens (pRExC_state->close_parens) +#define RExC_end_op (pRExC_state->end_op) +#define RExC_paren_names (pRExC_state->paren_names) +#define RExC_recurse (pRExC_state->recurse) +#define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_sets_depth (pRExC_state->sets_depth) +#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) +#define RExC_study_chunk_recursed_bytes \ + (pRExC_state->study_chunk_recursed_bytes) +#define RExC_in_lookaround (pRExC_state->in_lookaround) +#define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) + +#ifdef EBCDIC +# define SET_recode_x_to_native(x) \ + STMT_START { RExC_recode_x_to_native = (x); } STMT_END +#else +# define SET_recode_x_to_native(x) NOOP +#endif + +#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) +#define RExC_frame_head (pRExC_state->frame_head) +#define RExC_frame_last (pRExC_state->frame_last) +#define RExC_frame_count (pRExC_state->frame_count) +#define RExC_strict (pRExC_state->strict) +#define RExC_study_started (pRExC_state->study_started) +#define RExC_warn_text (pRExC_state->warn_text) +#define RExC_in_script_run (pRExC_state->in_script_run) +#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ) +#define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB) +#define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS) +#define RExC_unlexed_names (pRExC_state->unlexed_names) + + +/***********************************************************************/ +/* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse + * + * All of these macros depend on the above RExC_ accessor macros, which + * in turns depend on a variable pRExC_state being in scope where they + * are used. This is the standard regexp parser context variable which is + * passed into every non-trivial parse function in this file. + * + * Note that the UTF macro is itself a wrapper around RExC_utf8, so all + * of the macros which do not take an argument will operate on the + * pRExC_state structure *only*. + * + * Please do NOT modify RExC_parse without using these macros. In the + * future these macros will be extended for enhanced debugging and trace + * output during the parse process. + */ + +/* RExC_parse_incf(flag) + * + * Increment RExC_parse to point at the next codepoint, while doing + * the right thing depending on whether we are parsing UTF-8 strings + * or not. The 'flag' argument determines if content is UTF-8 or not, + * intended for cases where this is NOT governed by the UTF macro. + * + * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro. + * + * WARNING: Does NOT take into account RExC_end; it is the callers + * responsibility to make sure there are enough octets left in + * RExC_parse to ensure that when processing UTF-8 we would not read + * past the end of the string. + */ +#define RExC_parse_incf(flag) STMT_START { \ + RExC_parse += (flag) ? UTF8SKIP(RExC_parse) : 1; \ +} STMT_END + +/* RExC_parse_inc_safef(flag) + * + * Safely increment RExC_parse to point at the next codepoint, + * doing the right thing depending on whether we are parsing + * UTF-8 strings or not and NOT reading past the end of the buffer. + * The 'flag' argument determines if content is UTF-8 or not, + * intended for cases where this is NOT governed by the UTF macro. + * + * Use RExC_parse_safe() if UTF-8ness is controlled by the UTF macro. + * + * NOTE: Will NOT read past RExC_end when content is UTF-8. + */ +#define RExC_parse_inc_safef(flag) STMT_START { \ + RExC_parse += (flag) ? UTF8_SAFE_SKIP(RExC_parse,RExC_end) : 1; \ +} STMT_END + +/* RExC_parse_inc() + * + * Increment RExC_parse to point at the next codepoint, + * doing the right thing depending on whether we are parsing + * UTF-8 strings or not. + * + * WARNING: Does NOT take into account RExC_end, it is the callers + * responsibility to make sure there are enough octets left in + * RExC_parse to ensure that when processing UTF-8 we would not read + * past the end of the string. + * + * NOTE: whether we are parsing UTF-8 or not is determined by the + * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this + * macro operates on the pRExC_state structure only. + */ +#define RExC_parse_inc() RExC_parse_incf(UTF) + +/* RExC_parse_inc_safe() + * + * Safely increment RExC_parse to point at the next codepoint, + * doing the right thing depending on whether we are parsing + * UTF-8 strings or not and NOT reading past the end of the buffer. + * + * NOTE: whether we are parsing UTF-8 or not is determined by the + * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this + * macro operates on the pRExC_state structure only. + */ +#define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF) + +/* RExC_parse_inc_utf8() + * + * Increment RExC_parse to point at the next utf8 codepoint, + * assumes content is UTF-8. + * + * WARNING: Does NOT take into account RExC_end; it is the callers + * responsibility to make sure there are enough octets left in RExC_parse + * to ensure that when processing UTF-8 we would not read past the end + * of the string. + */ +#define RExC_parse_inc_utf8() STMT_START { \ + RExC_parse += UTF8SKIP(RExC_parse); \ +} STMT_END + +/* RExC_parse_inc_if_char() + * + * Increment RExC_parse to point at the next codepoint, if and only + * if the current parse point is NOT a NULL, while doing the right thing + * depending on whether we are parsing UTF-8 strings or not. + * + * WARNING: Does NOT take into account RExC_end, it is the callers + * responsibility to make sure there are enough octets left in RExC_parse + * to ensure that when processing UTF-8 we would not read past the end + * of the string. + * + * NOTE: whether we are parsing UTF-8 or not is determined by the + * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this + * macro operates on the pRExC_state structure only. + */ +#define RExC_parse_inc_if_char() STMT_START { \ + RExC_parse += SKIP_IF_CHAR(RExC_parse,RExC_end); \ +} STMT_END + +/* RExC_parse_inc_by(n_octets) + * + * Increment the parse cursor by the number of octets specified by + * the 'n_octets' argument. + * + * NOTE: Does NOT check ANY constraints. It is the callers responsibility + * that this will not move past the end of the string, or leave the + * pointer in the middle of a UTF-8 sequence. + * + * Typically used to advanced past previously analyzed content. + */ +#define RExC_parse_inc_by(n_octets) STMT_START { \ + RExC_parse += (n_octets); \ +} STMT_END + +/* RExC_parse_set(to_ptr) + * + * Sets the RExC_parse pointer to the pointer specified by the 'to' + * argument. No validation whatsoever is performed on the to pointer. + */ +#define RExC_parse_set(to_ptr) STMT_START { \ + RExC_parse = (to_ptr); \ +} STMT_END + +/**********************************************************************/ + +/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set + * a flag to disable back-off on the fixed/floating substrings - if it's + * a high complexity pattern we assume the benefit of avoiding a full match + * is worth the cost of checking for the substrings even if they rarely help. + */ +#define RExC_naughty (pRExC_state->naughty) +#define TOO_NAUGHTY (10) +#define MARK_NAUGHTY(add) \ + if (RExC_naughty < TOO_NAUGHTY) \ + RExC_naughty += (add) +#define MARK_NAUGHTY_EXP(exp, add) \ + if (RExC_naughty < TOO_NAUGHTY) \ + RExC_naughty += RExC_naughty / (exp) + (add) + +#define isNON_BRACE_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?') +#define isQUANTIFIER(s,e) ( isNON_BRACE_QUANTIFIER(*s) \ + || ((*s) == '{' && regcurly(s, e, NULL))) + +/* + * Flags to be passed up. + */ +#define HASWIDTH 0x01 /* Known to not match null strings, could match + non-null ones. */ +#define SIMPLE 0x02 /* Exactly one character wide */ + /* (or LNBREAK as a special case) */ +#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ +#define TRYAGAIN 0x10 /* Weeded out a declaration. */ +#define RESTART_PARSE 0x20 /* Need to redo the parse */ +#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to + calcuate sizes as UTF-8 */ + +#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) + +/* whether trie related optimizations are enabled */ +#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION +#define TRIE_STUDY_OPT +#define FULL_TRIE_STUDY +#define TRIE_STCLASS +#endif + +/* About the term "restudy" and the var "restudied" and the defines + * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to + * doing multiple study_chunk() calls over the same set of opcodes for* the + * purpose of enhanced TRIE optimizations. + * + * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal + * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled + * which then causes the Perl_re_op_compile() to then call the optimizer + * S_study_chunk() a second time to perform additional optimizations, + * including the aho_corasick startclass optimization. + * This additional pass will only happen once, which is managed by the + * 'restudied' variable in Perl_re_op_compile(). + * + * When this second pass is under way the flags passed into study_chunk() will + * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down + * to any recursive calls to S_study_chunk(). + * + * IMPORTANT: Any logic in study_chunk() that emits warnings should check that + * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may + * be produced twice. + * + * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and + * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details. + */ + + +#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] +#define PBITVAL(paren) (1 << ((paren) & 7)) +#define PAREN_OFFSET(depth) \ + (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes) +#define PAREN_TEST(depth, paren) \ + (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren)) +#define PAREN_SET(depth, paren) \ + (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren)) +#define PAREN_UNSET(depth, paren) \ + (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren)) + +#define REQUIRE_UTF8(flagp) STMT_START { \ + if (!UTF) { \ + *flagp = RESTART_PARSE|NEED_UTF8; \ + return 0; \ + } \ + } STMT_END + +/* /u is to be chosen if we are supposed to use Unicode rules, or if the + * pattern is in UTF-8. This latter condition is in case the outermost rules + * are locale. See GH #17278 */ +#define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF) + +/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is + * a flag that indicates we need to override /d with /u as a result of + * something in the pattern. It should only be used in regards to calling + * set_regex_charset() or get_regex_charset() */ +#define REQUIRE_UNI_RULES(flagp, restart_retval) \ + STMT_START { \ + if (DEPENDS_SEMANTICS) { \ + set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ + RExC_uni_semantics = 1; \ + if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \ + /* No need to restart the parse if we haven't seen \ + * anything that differs between /u and /d, and no need \ + * to restart immediately if we're going to reparse \ + * anyway to count parens */ \ + *flagp |= RESTART_PARSE; \ + return restart_retval; \ + } \ + } \ + } STMT_END + +#define REQUIRE_BRANCHJ(flagp, restart_retval) \ + STMT_START { \ + RExC_use_BRANCHJ = 1; \ + *flagp |= RESTART_PARSE; \ + return restart_retval; \ + } STMT_END + +/* Until we have completed the parse, we leave RExC_total_parens at 0 or + * less. After that, it must always be positive, because the whole re is + * considered to be surrounded by virtual parens. Setting it to negative + * indicates there is some construct that needs to know the actual number of + * parens to be properly handled. And that means an extra pass will be + * required after we've counted them all */ +#define ALL_PARENS_COUNTED (RExC_total_parens > 0) +#define REQUIRE_PARENS_PASS \ + STMT_START { /* No-op if have completed a pass */ \ + if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \ + } STMT_END +#define IN_PARENS_PASS (RExC_total_parens < 0) + + +/* This is used to return failure (zero) early from the calling function if + * various flags in 'flags' are set. Two flags always cause a return: + * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any + * additional flags that should cause a return; 0 if none. If the return will + * be done, '*flagp' is first set to be all of the flags that caused the + * return. */ +#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \ + STMT_START { \ + if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \ + *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \ + return 0; \ + } \ + } STMT_END + +#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE)) + +#define RETURN_FAIL_ON_RESTART(flags,flagp) \ + RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0) +#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \ + if (MUST_RESTART(*(flagp))) return 0 + +/* This converts the named class defined in regcomp.h to its equivalent class + * number defined in handy.h. */ +#define namedclass_to_classnum(class) ((int) ((class) / 2)) +#define classnum_to_namedclass(classnum) ((classnum) * 2) + +#define _invlist_union_complement_2nd(a, b, output) \ + _invlist_union_maybe_complement_2nd(a, b, TRUE, output) +#define _invlist_intersection_complement_2nd(a, b, output) \ + _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) + +/* We add a marker if we are deferring expansion of a property that is both + * 1) potentiallly user-defined; and + * 2) could also be an official Unicode property. + * + * Without this marker, any deferred expansion can only be for a user-defined + * one. This marker shouldn't conflict with any that could be in a legal name, + * and is appended to its name to indicate this. There is a string and + * character form */ +#define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~" +#define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~' + +/* What is infinity for optimization purposes */ +#define OPTIMIZE_INFTY SSize_t_MAX + +/* About scan_data_t. + + During optimisation we recurse through the regexp program performing + various inplace (keyhole style) optimisations. In addition study_chunk + and scan_commit populate this data structure with information about + what strings MUST appear in the pattern. We look for the longest + string that must appear at a fixed location, and we look for the + longest string that may appear at a floating location. So for instance + in the pattern: + + /FOO[xX]A.*B[xX]BAR/ + + Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating + strings (because they follow a .* construct). study_chunk will identify + both FOO and BAR as being the longest fixed and floating strings respectively. + + The strings can be composites, for instance + + /(f)(o)(o)/ + + will result in a composite fixed substring 'foo'. + + For each string some basic information is maintained: + + - min_offset + This is the position the string must appear at, or not before. + It also implicitly (when combined with minlenp) tells us how many + characters must match before the string we are searching for. + Likewise when combined with minlenp and the length of the string it + tells us how many characters must appear after the string we have + found. + + - max_offset + Only used for floating strings. This is the rightmost point that + the string can appear at. If set to OPTIMIZE_INFTY it indicates that the + string can occur infinitely far to the right. + For fixed strings, it is equal to min_offset. + + - minlenp + A pointer to the minimum number of characters of the pattern that the + string was found inside. This is important as in the case of positive + lookahead or positive lookbehind we can have multiple patterns + involved. Consider + + /(?=FOO).*F/ + + The minimum length of the pattern overall is 3, the minimum length + of the lookahead part is 3, but the minimum length of the part that + will actually match is 1. So 'FOO's minimum length is 3, but the + minimum length for the F is 1. This is important as the minimum length + is used to determine offsets in front of and behind the string being + looked for. Since strings can be composites this is the length of the + pattern at the time it was committed with a scan_commit. Note that + the length is calculated by study_chunk, so that the minimum lengths + are not known until the full pattern has been compiled, thus the + pointer to the value. + + - lookbehind + + In the case of lookbehind the string being searched for can be + offset past the start point of the final matching string. + If this value was just blithely removed from the min_offset it would + invalidate some of the calculations for how many chars must match + before or after (as they are derived from min_offset and minlen and + the length of the string being searched for). + When the final pattern is compiled and the data is moved from the + scan_data_t structure into the regexp structure the information + about lookbehind is factored in, with the information that would + have been lost precalculated in the end_shift field for the + associated string. + + The fields pos_min and pos_delta are used to store the minimum offset + and the delta to the maximum offset at the current point in the pattern. + +*/ + +struct scan_data_substrs { + SV *str; /* longest substring found in pattern */ + SSize_t min_offset; /* earliest point in string it can appear */ + SSize_t max_offset; /* latest point in string it can appear */ + SSize_t *minlenp; /* pointer to the minlen relevant to the string */ + SSize_t lookbehind; /* is the pos of the string modified by LB */ + I32 flags; /* per substring SF_* and SCF_* flags */ +}; + +typedef struct scan_data_t { + /*I32 len_min; unused */ + /*I32 len_delta; unused */ + SSize_t pos_min; + SSize_t pos_delta; + SV *last_found; + SSize_t last_end; /* min value, <0 unless valid. */ + SSize_t last_start_min; + SSize_t last_start_max; + U8 cur_is_floating; /* whether the last_* values should be set as + * the next fixed (0) or floating (1) + * substring */ + + /* [0] is longest fixed substring so far, [1] is longest float so far */ + struct scan_data_substrs substrs[2]; + + I32 flags; /* common SF_* and SCF_* flags */ + I32 whilem_c; + SSize_t *last_closep; + regnode **last_close_opp; /* pointer to pointer to last CLOSE regop + seen. DO NOT DEREFERENCE the regnode + pointer - the op may have been optimized + away */ + regnode_ssc *start_class; +} scan_data_t; + +/* + * Forward declarations for pregcomp()'s friends. + */ + +static const scan_data_t zero_scan_data = { + 0, 0, NULL, 0, 0, 0, 0, + { + { NULL, 0, 0, 0, 0, 0 }, + { NULL, 0, 0, 0, 0, 0 }, + }, + 0, 0, NULL, NULL, NULL +}; + +/* study flags */ + +#define SF_BEFORE_SEOL 0x0001 +#define SF_BEFORE_MEOL 0x0002 +#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) + +#define SF_IS_INF 0x0040 +#define SF_HAS_PAR 0x0080 +#define SF_IN_PAR 0x0100 +#define SF_HAS_EVAL 0x0200 + + +/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the + * longest substring in the pattern. When it is not set the optimiser keeps + * track of position, but does not keep track of the actual strings seen, + * + * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but + * /foo/i will not. + * + * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble" + * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be + * turned off because of the alternation (BRANCH). */ +#define SCF_DO_SUBSTR 0x0400 + +#define SCF_DO_STCLASS_AND 0x0800 +#define SCF_DO_STCLASS_OR 0x1000 +#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) +#define SCF_WHILEM_VISITED_POS 0x2000 + +#define SCF_TRIE_RESTUDY 0x4000 /* Need to do restudy in study_chunk()? + Search for "restudy" in this file + to find a detailed explanation.*/ +#define SCF_SEEN_ACCEPT 0x8000 +#define SCF_TRIE_DOING_RESTUDY 0x10000 /* Are we in restudy right now? + Search for "restudy" in this file + to find a detailed explanation. */ +#define SCF_IN_DEFINE 0x20000 + + + +#define UTF cBOOL(RExC_utf8) + +/* The enums for all these are ordered so things work out correctly */ +#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) +#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ + == REGEX_DEPENDS_CHARSET) +#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) +#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ + >= REGEX_UNICODE_CHARSET) +#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_RESTRICTED_CHARSET) +#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_MORE_RESTRICTED_CHARSET) + +#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) + +/* For programs that want to be strictly Unicode compatible by dying if any + * attempt is made to match a non-Unicode code point against a Unicode + * property. */ +#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) + +#define OOB_NAMEDCLASS -1 + +/* There is no code point that is out-of-bounds, so this is problematic. But + * its only current use is to initialize a variable that is always set before + * looked at. */ +#define OOB_UNICODE 0xDEADBEEF + +#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) + + +/* length of regex to show in messages that don't mark a position within */ +#define RegexLengthToShowInErrorMessages 127 + +/* + * If MARKER[12] are adjusted, be sure to adjust the constants at the top + * of t/op/regmesg.t, the tests in t/op/re_tests, and those in + * op/pragma/warn/regcomp. + */ +#define MARKER1 "<-- HERE" /* marker as it appears in the description */ +#define MARKER2 " <-- HERE " /* marker as it appears within the regex */ + +#define REPORT_LOCATION " in regex; marked by " MARKER1 \ + " in m/%" UTF8f MARKER2 "%" UTF8f "/" + +/* The code in this file in places uses one level of recursion with parsing + * rebased to an alternate string constructed by us in memory. This can take + * the form of something that is completely different from the input, or + * something that uses the input as part of the alternate. In the first case, + * there should be no possibility of an error, as we are in complete control of + * the alternate string. But in the second case we don't completely control + * the input portion, so there may be errors in that. Here's an example: + * /[abc\x{DF}def]/ui + * is handled specially because \x{df} folds to a sequence of more than one + * character: 'ss'. What is done is to create and parse an alternate string, + * which looks like this: + * /(?:\x{DF}|[abc\x{DF}def])/ui + * where it uses the input unchanged in the middle of something it constructs, + * which is a branch for the DF outside the character class, and clustering + * parens around the whole thing. (It knows enough to skip the DF inside the + * class while in this substitute parse.) 'abc' and 'def' may have errors that + * need to be reported. The general situation looks like this: + * + * |<------- identical ------>| + * sI tI xI eI + * Input: --------------------------------------------------------------- + * Constructed: --------------------------------------------------- + * sC tC xC eC EC + * |<------- identical ------>| + * + * sI..eI is the portion of the input pattern we are concerned with here. + * sC..EC is the constructed substitute parse string. + * sC..tC is constructed by us + * tC..eC is an exact duplicate of the portion of the input pattern tI..eI. + * In the diagram, these are vertically aligned. + * eC..EC is also constructed by us. + * xC is the position in the substitute parse string where we found a + * problem. + * xI is the position in the original pattern corresponding to xC. + * + * We want to display a message showing the real input string. Thus we need to + * translate from xC to xI. We know that xC >= tC, since the portion of the + * string sC..tC has been constructed by us, and so shouldn't have errors. We + * get: + * xI = tI + (xC - tC) + * + * When the substitute parse is constructed, the code needs to set: + * RExC_start (sC) + * RExC_end (eC) + * RExC_copy_start_in_input (tI) + * RExC_copy_start_in_constructed (tC) + * and restore them when done. + * + * During normal processing of the input pattern, both + * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to + * sI, so that xC equals xI. + */ + +#define sI RExC_precomp +#define eI RExC_precomp_end +#define sC RExC_start +#define eC RExC_end +#define tI RExC_copy_start_in_input +#define tC RExC_copy_start_in_constructed +#define xI(xC) (tI + (xC - tC)) +#define xI_offset(xC) (xI(xC) - sI) + +#define REPORT_LOCATION_ARGS(xC) \ + UTF8fARG(UTF, \ + (xI(xC) > eI) /* Don't run off end */ \ + ? eI - sI /* Length before the <--HERE */ \ + : ((xI_offset(xC) >= 0) \ + ? xI_offset(xC) \ + : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \ + IVdf " trying to output message for " \ + " pattern %.*s", \ + __FILE__, __LINE__, (IV) xI_offset(xC), \ + ((int) (eC - sC)), sC), 0)), \ + sI), /* The input pattern printed up to the <--HERE */ \ + UTF8fARG(UTF, \ + (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \ + (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */ + +/* Used to point after bad bytes for an error message, but avoid skipping + * past a nul byte. */ +#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1) + +/* Set up to clean up after our imminent demise */ +#define PREPARE_TO_DIE \ + STMT_START { \ + if (RExC_rx_sv) \ + SAVEFREESV(RExC_rx_sv); \ + if (RExC_open_parens) \ + SAVEFREEPV(RExC_open_parens); \ + if (RExC_close_parens) \ + SAVEFREEPV(RExC_close_parens); \ + } STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given + * arg. Show regex, up to a maximum length. If it's too long, chop and add + * "...". + */ +#define _FAIL(code) STMT_START { \ + const char *ellipses = ""; \ + IV len = RExC_precomp_end - RExC_precomp; \ + \ + PREPARE_TO_DIE; \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." */ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + code; \ +} STMT_END + +#define FAIL(msg) _FAIL( \ + Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \ + msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + +#define FAIL2(msg,arg) _FAIL( \ + Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ + arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + +#define FAIL3(msg,arg1,arg2) _FAIL( \ + Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ + arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + +/* + * Simple_vFAIL -- like FAIL, but marks the current location in the scan + */ +#define Simple_vFAIL(m) STMT_START { \ + Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ + m, REPORT_LOCATION_ARGS(RExC_parse)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() + */ +#define vFAIL(m) STMT_START { \ + PREPARE_TO_DIE; \ + Simple_vFAIL(m); \ +} STMT_END + +/* + * Like Simple_vFAIL(), but accepts two arguments. + */ +#define Simple_vFAIL2(m,a1) STMT_START { \ + S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). + */ +#define vFAIL2(m,a1) STMT_START { \ + PREPARE_TO_DIE; \ + Simple_vFAIL2(m, a1); \ +} STMT_END + + +/* + * Like Simple_vFAIL(), but accepts three arguments. + */ +#define Simple_vFAIL3(m, a1, a2) STMT_START { \ + S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). + */ +#define vFAIL3(m,a1,a2) STMT_START { \ + PREPARE_TO_DIE; \ + Simple_vFAIL3(m, a1, a2); \ +} STMT_END + +/* + * Like Simple_vFAIL(), but accepts four arguments. + */ +#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ + S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ +} STMT_END + +#define vFAIL4(m,a1,a2,a3) STMT_START { \ + PREPARE_TO_DIE; \ + Simple_vFAIL4(m, a1, a2, a3); \ +} STMT_END + +/* A specialized version of vFAIL2 that works with UTF8f */ +#define vFAIL2utf8f(m, a1) STMT_START { \ + PREPARE_TO_DIE; \ + S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ +} STMT_END + +#define vFAIL3utf8f(m, a1, a2) STMT_START { \ + PREPARE_TO_DIE; \ + S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ +} STMT_END + +/* Setting this to NULL is a signal to not output warnings */ +#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \ + STMT_START { \ + RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\ + RExC_copy_start_in_constructed = NULL; \ + } STMT_END +#define RESTORE_WARNINGS \ + RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed + +/* Since a warning can be generated multiple times as the input is reparsed, we + * output it the first time we come to that point in the parse, but suppress it + * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not + * generate any warnings */ +#define TO_OUTPUT_WARNINGS(loc) \ + ( RExC_copy_start_in_constructed \ + && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset) + +/* After we've emitted a warning, we save the position in the input so we don't + * output it again */ +#define UPDATE_WARNINGS_LOC(loc) \ + STMT_START { \ + if (TO_OUTPUT_WARNINGS(loc)) { \ + RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \ + - RExC_precomp; \ + } \ + } STMT_END + +/* 'warns' is the output of the packWARNx macro used in 'code' */ +#define _WARN_HELPER(loc, warns, code) \ + STMT_START { \ + if (! RExC_copy_start_in_constructed) { \ + Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \ + " expected at '%s'", \ + __FILE__, __LINE__, loc); \ + } \ + if (TO_OUTPUT_WARNINGS(loc)) { \ + if (ckDEAD(warns)) \ + PREPARE_TO_DIE; \ + code; \ + UPDATE_WARNINGS_LOC(loc); \ + } \ + } STMT_END + +/* m is not necessarily a "literal string", in this macro */ +#define warn_non_literal_string(loc, packed_warn, m) \ + _WARN_HELPER(loc, packed_warn, \ + Perl_warner(aTHX_ packed_warn, \ + "%s" REPORT_LOCATION, \ + m, REPORT_LOCATION_ARGS(loc))) +#define reg_warn_non_literal_string(loc, m) \ + warn_non_literal_string(loc, packWARN(WARN_REGEXP), m) + +#define ckWARN2_non_literal_string(loc, packwarn, m, a1) \ + STMT_START { \ + char * format; \ + Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\ + Newx(format, format_size, char); \ + my_strlcpy(format, m, format_size); \ + my_strlcat(format, REPORT_LOCATION, format_size); \ + SAVEFREEPV(format); \ + _WARN_HELPER(loc, packwarn, \ + Perl_ck_warner(aTHX_ packwarn, \ + format, \ + a1, REPORT_LOCATION_ARGS(loc))); \ + } STMT_END + +#define ckWARNreg(loc,m) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc))) + +#define vWARN(loc, m) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc))) \ + +#define vWARN_dep(loc, m) \ + _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \ + Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc))) + +#define ckWARNdep(loc,m) \ + _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc))) + +#define ckWARNregdep(loc,m) \ + _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \ + WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc))) + +#define ckWARN2reg_d(loc,m, a1) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(loc))) + +#define ckWARN2reg(loc, m, a1) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(loc))) + +#define vWARN3(loc, m, a1, a2) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, REPORT_LOCATION_ARGS(loc))) + +#define ckWARN3reg(loc, m, a1, a2) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, \ + REPORT_LOCATION_ARGS(loc))) + +#define vWARN4(loc, m, a1, a2, a3) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, a3, \ + REPORT_LOCATION_ARGS(loc))) + +#define ckWARN4reg(loc, m, a1, a2, a3) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, a3, \ + REPORT_LOCATION_ARGS(loc))) + +#define vWARN5(loc, m, a1, a2, a3, a4) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, a3, a4, \ + REPORT_LOCATION_ARGS(loc))) + +#define ckWARNexperimental(loc, class, m) \ + STMT_START { \ + if (! RExC_warned_ ## class) { /* warn once per compilation */ \ + RExC_warned_ ## class = 1; \ + _WARN_HELPER(loc, packWARN(class), \ + Perl_ck_warner_d(aTHX_ packWARN(class), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc)));\ + } \ + } STMT_END + +#define ckWARNexperimental_with_arg(loc, class, m, arg) \ + STMT_START { \ + if (! RExC_warned_ ## class) { /* warn once per compilation */ \ + RExC_warned_ ## class = 1; \ + _WARN_HELPER(loc, packWARN(class), \ + Perl_ck_warner_d(aTHX_ packWARN(class), \ + m REPORT_LOCATION, \ + arg, REPORT_LOCATION_ARGS(loc)));\ + } \ + } STMT_END + +/* Convert between a pointer to a node and its offset from the beginning of the + * program */ +#define REGNODE_p(offset) (RExC_emit_start + (offset)) +#define REGNODE_OFFSET(node) (__ASSERT_((node) >= RExC_emit_start) \ + (SSize_t) ((node) - RExC_emit_start)) + +#define ProgLen(ri) ri->proglen +#define SetProgLen(ri,x) ri->proglen = x + +#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS +#define EXPERIMENTAL_INPLACESCAN +#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ + +#define DEBUG_RExC_seen() \ + DEBUG_OPTIMISE_MORE_r({ \ + Perl_re_printf( aTHX_ "RExC_seen: "); \ + \ + if (RExC_seen & REG_ZERO_LEN_SEEN) \ + Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \ + \ + if (RExC_seen & REG_LOOKBEHIND_SEEN) \ + Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \ + \ + if (RExC_seen & REG_GPOS_SEEN) \ + Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \ + \ + if (RExC_seen & REG_RECURSE_SEEN) \ + Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \ + \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \ + \ + if (RExC_seen & REG_VERBARG_SEEN) \ + Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \ + \ + if (RExC_seen & REG_CUTGROUP_SEEN) \ + Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \ + \ + if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ + Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \ + \ + if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ + Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \ + \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + \ + Perl_re_printf( aTHX_ "\n"); \ + }); + +#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \ + if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag) + + +#ifdef DEBUGGING +# define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \ + debug_studydata(where, data, depth, is_inf, min, stopmin, delta) + +# define DEBUG_PEEP(str, scan, depth, flags) \ + debug_peep(str, pRExC_state, scan, depth, flags) +#else +# define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP +# define DEBUG_PEEP(str, scan, depth, flags) NOOP +#endif + +#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) +#ifdef DEBUGGING +#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) +#else +#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) +#endif + +#define MADE_TRIE 1 +#define MADE_JUMP_TRIE 2 +#define MADE_EXACT_TRIE 4 + +#define INVLIST_INDEX 0 +#define ONLY_LOCALE_MATCHES_INDEX 1 +#define DEFERRED_USER_DEFINED_INDEX 2 + +/* These two functions currently do the exact same thing */ +#define ssc_init_zero ssc_init + +#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) +#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) + +#ifdef DEBUGGING +#define REGNODE_GUTS(state,op,extra_size) \ + regnode_guts_debug(state,op,extra_size) +#else +#define REGNODE_GUTS(state,op,extra_size) \ + regnode_guts(state,extra_size) +#endif + +#define CLEAR_OPTSTART \ + if (optstart) STMT_START { \ + DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \ + " (%" IVdf " nodes)\n", (IV)(node - optstart))); \ + optstart=NULL; \ + } STMT_END + +#define DUMPUNTIL(b,e) \ + CLEAR_OPTSTART; \ + node = dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); + +#endif /* REGCOMP_INTERNAL_H */ diff --git a/regcomp_invlist.c b/regcomp_invlist.c new file mode 100644 index 0000000000..9ea3f43181 --- /dev/null +++ b/regcomp_invlist.c @@ -0,0 +1,1540 @@ +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + +#include "EXTERN.h" +#define PERL_IN_REGEX_ENGINE +#define PERL_IN_REGCOMP_ANY +#define PERL_IN_REGCOMP_INVLIST_C +#include "perl.h" + +#ifdef PERL_IN_XSUB_RE +# include "re_comp.h" +#else +# include "regcomp.h" +#endif + +#include "invlist_inline.h" +#include "unicode_constants.h" +#include "regcomp_internal.h" + + +void +Perl_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len) +{ + PERL_ARGS_ASSERT_POPULATE_BITMAP_FROM_INVLIST; + + /* As the name says. The zeroth bit corresponds to the code point given by + * 'offset' */ + + UV start, end; + + Zero(bitmap, len, U8); + + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + assert(start >= offset); + + for (UV i = start; i <= end; i++) { + UV adjusted = i - offset; + + BITMAP_BYTE(bitmap, adjusted) |= BITMAP_BIT(adjusted); + } + } + invlist_iterfinish(invlist); +} + +void +Perl_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_len, SV ** invlist, const UV offset) +{ + PERL_ARGS_ASSERT_POPULATE_INVLIST_FROM_BITMAP; + + /* As the name says. The zeroth bit corresponds to the code point given by + * 'offset' */ + + Size_t i; + + for (i = 0; i < bitmap_len; i++) { + if (BITMAP_TEST(bitmap, i)) { + int start = i++; + + /* Save a little work by adding a range all at once instead of bit + * by bit */ + while (i < bitmap_len && BITMAP_TEST(bitmap, i)) { + i++; + } + + *invlist = _add_range_to_invlist(*invlist, + start + offset, + i + offset - 1); + } + } +} + +/* This section of code defines the inversion list object and its methods. The + * interfaces are highly subject to change, so as much as possible is static to + * this file. An inversion list is here implemented as a malloc'd C UV array + * as an SVt_INVLIST scalar. + * + * An inversion list for Unicode is an array of code points, sorted by ordinal + * number. Each element gives the code point that begins a range that extends + * up-to but not including the code point given by the next element. The final + * element gives the first code point of a range that extends to the platform's + * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4], + * ...) give ranges whose code points are all in the inversion list. We say + * that those ranges are in the set. The odd-numbered elements give ranges + * whose code points are not in the inversion list, and hence not in the set. + * Thus, element [0] is the first code point in the list. Element [1] + * is the first code point beyond that not in the list; and element [2] is the + * first code point beyond that that is in the list. In other words, the first + * range is invlist[0]..(invlist[1]-1), and all code points in that range are + * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and + * all code points in that range are not in the inversion list. The third + * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion + * list, and so forth. Thus every element whose index is divisible by two + * gives the beginning of a range that is in the list, and every element whose + * index is not divisible by two gives the beginning of a range not in the + * list. If the final element's index is divisible by two, the inversion list + * extends to the platform's infinity; otherwise the highest code point in the + * inversion list is the contents of that element minus 1. + * + * A range that contains just a single code point N will look like + * invlist[i] == N + * invlist[i+1] == N+1 + * + * If N is UV_MAX (the highest representable code point on the machine), N+1 is + * impossible to represent, so element [i+1] is omitted. The single element + * inversion list + * invlist[0] == UV_MAX + * contains just UV_MAX, but is interpreted as matching to infinity. + * + * Taking the complement (inverting) an inversion list is quite simple, if the + * first element is 0, remove it; otherwise add a 0 element at the beginning. + * This implementation reserves an element at the beginning of each inversion + * list to always contain 0; there is an additional flag in the header which + * indicates if the list begins at the 0, or is offset to begin at the next + * element. This means that the inversion list can be inverted without any + * copying; just flip the flag. + * + * More about inversion lists can be found in "Unicode Demystified" + * Chapter 13 by Richard Gillam, published by Addison-Wesley. + * + * The inversion list data structure is currently implemented as an SV pointing + * to an array of UVs that the SV thinks are bytes. This allows us to have an + * array of UV whose memory management is automatically handled by the existing + * facilities for SV's. + * + * Some of the methods should always be private to the implementation, and some + * should eventually be made public */ + +/* The header definitions are in F<invlist_inline.h> */ + +#ifndef PERL_IN_XSUB_RE + +PERL_STATIC_INLINE UV* +S__invlist_array_init(SV* const invlist, const bool will_have_0) +{ + /* Returns a pointer to the first element in the inversion list's array. + * This is called upon initialization of an inversion list. Where the + * array begins depends on whether the list has the code point U+0000 in it + * or not. The other parameter tells it whether the code that follows this + * call is about to put a 0 in the inversion list or not. The first + * element is either the element reserved for 0, if TRUE, or the element + * after it, if FALSE */ + + bool* offset = get_invlist_offset_addr(invlist); + UV* zero_addr = (UV *) SvPVX(invlist); + + PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; + + /* Must be empty */ + assert(! _invlist_len(invlist)); + + *zero_addr = 0; + + /* 1^1 = 0; 1^0 = 1 */ + *offset = 1 ^ will_have_0; + return zero_addr + *offset; +} + +STATIC void +S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src) +{ + /* Replaces the inversion list in 'dest' with the one from 'src'. It + * steals the list from 'src', so 'src' is made to have a NULL list. This + * is similar to what SvSetMagicSV() would do, if it were implemented on + * inversion lists, though this routine avoids a copy */ + + const UV src_len = _invlist_len(src); + const bool src_offset = *get_invlist_offset_addr(src); + const STRLEN src_byte_len = SvLEN(src); + char * array = SvPVX(src); + +#ifndef NO_TAINT_SUPPORT + const int oldtainted = TAINT_get; +#endif + + PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC; + + assert(is_invlist(src)); + assert(is_invlist(dest)); + assert(! invlist_is_iterating(src)); + assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src)); + + /* Make sure it ends in the right place with a NUL, as our inversion list + * manipulations aren't careful to keep this true, but sv_usepvn_flags() + * asserts it */ + array[src_byte_len - 1] = '\0'; + + TAINT_NOT; /* Otherwise it breaks */ + sv_usepvn_flags(dest, + (char *) array, + src_byte_len - 1, + + /* This flag is documented to cause a copy to be avoided */ + SV_HAS_TRAILING_NUL); + TAINT_set(oldtainted); + SvPV_set(src, 0); + SvLEN_set(src, 0); + SvCUR_set(src, 0); + + /* Finish up copying over the other fields in an inversion list */ + *get_invlist_offset_addr(dest) = src_offset; + invlist_set_len(dest, src_len, src_offset); + *get_invlist_previous_index_addr(dest) = 0; + invlist_iterfinish(dest); +} + +PERL_STATIC_INLINE IV* +S_get_invlist_previous_index_addr(SV* invlist) +{ + /* Return the address of the IV that is reserved to hold the cached index + * */ + PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; + + assert(is_invlist(invlist)); + + return &(((XINVLIST*) SvANY(invlist))->prev_index); +} + +PERL_STATIC_INLINE IV +S_invlist_previous_index(SV* const invlist) +{ + /* Returns cached index of previous search */ + + PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX; + + return *get_invlist_previous_index_addr(invlist); +} + +PERL_STATIC_INLINE void +S_invlist_set_previous_index(SV* const invlist, const IV index) +{ + /* Caches <index> for later retrieval */ + + PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX; + + assert(index == 0 || index < (int) _invlist_len(invlist)); + + *get_invlist_previous_index_addr(invlist) = index; +} + +PERL_STATIC_INLINE void +S_invlist_trim(SV* invlist) +{ + /* Free the not currently-being-used space in an inversion list */ + + /* But don't free up the space needed for the 0 UV that is always at the + * beginning of the list, nor the trailing NUL */ + const UV min_size = TO_INTERNAL_SIZE(1) + 1; + + PERL_ARGS_ASSERT_INVLIST_TRIM; + + assert(is_invlist(invlist)); + + SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1)); +} + +PERL_STATIC_INLINE void +S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */ +{ + PERL_ARGS_ASSERT_INVLIST_CLEAR; + + assert(is_invlist(invlist)); + + invlist_set_len(invlist, 0, 0); + invlist_trim(invlist); +} + +PERL_STATIC_INLINE UV +S_invlist_max(const SV* const invlist) +{ + /* Returns the maximum number of elements storable in the inversion list's + * array, without having to realloc() */ + + PERL_ARGS_ASSERT_INVLIST_MAX; + + assert(is_invlist(invlist)); + + /* Assumes worst case, in which the 0 element is not counted in the + * inversion list, so subtracts 1 for that */ + return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ + ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 + : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; +} + +STATIC void +S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size) +{ + PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS; + + /* First 1 is in case the zero element isn't in the list; second 1 is for + * trailing NUL */ + SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1); + invlist_set_len(invlist, 0, 0); + + /* Force iterinit() to be used to get iteration to work */ + invlist_iterfinish(invlist); + + *get_invlist_previous_index_addr(invlist) = 0; + SvPOK_on(invlist); /* This allows B to extract the PV */ +} + +SV* +Perl__new_invlist(pTHX_ IV initial_size) +{ + + /* Return a pointer to a newly constructed inversion list, with enough + * space to store 'initial_size' elements. If that number is negative, a + * system default is used instead */ + + SV* new_list; + + if (initial_size < 0) { + initial_size = 10; + } + + new_list = newSV_type(SVt_INVLIST); + initialize_invlist_guts(new_list, initial_size); + + return new_list; +} + +SV* +Perl__new_invlist_C_array(pTHX_ const UV* const list) +{ + /* Return a pointer to a newly constructed inversion list, initialized to + * point to <list>, which has to be in the exact correct inversion list + * form, including internal fields. Thus this is a dangerous routine that + * should not be used in the wrong hands. The passed in 'list' contains + * several header fields at the beginning that are not part of the + * inversion list body proper */ + + const STRLEN length = (STRLEN) list[0]; + const UV version_id = list[1]; + const bool offset = cBOOL(list[2]); +#define HEADER_LENGTH 3 + /* If any of the above changes in any way, you must change HEADER_LENGTH + * (if appropriate) and regenerate INVLIST_VERSION_ID by running + * perl -E 'say int(rand 2**31-1)' + */ +#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and + data structure type, so that one being + passed in can be validated to be an + inversion list of the correct vintage. + */ + + SV* invlist = newSV_type(SVt_INVLIST); + + PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; + + if (version_id != INVLIST_VERSION_ID) { + Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); + } + + /* The generated array passed in includes header elements that aren't part + * of the list proper, so start it just after them */ + SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); + + SvLEN_set(invlist, 0); /* Means we own the contents, and the system + shouldn't touch it */ + + *(get_invlist_offset_addr(invlist)) = offset; + + /* The 'length' passed to us is the physical number of elements in the + * inversion list. But if there is an offset the logical number is one + * less than that */ + invlist_set_len(invlist, length - offset, offset); + + invlist_set_previous_index(invlist, 0); + + /* Initialize the iteration pointer. */ + invlist_iterfinish(invlist); + + SvREADONLY_on(invlist); + SvPOK_on(invlist); + + return invlist; +} + +STATIC void +S__append_range_to_invlist(pTHX_ SV* const invlist, + const UV start, const UV end) +{ + /* Subject to change or removal. Append the range from 'start' to 'end' at + * the end of the inversion list. The range must be above any existing + * ones. */ + + UV* array; + UV max = invlist_max(invlist); + UV len = _invlist_len(invlist); + bool offset; + + PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; + + if (len == 0) { /* Empty lists must be initialized */ + offset = start != 0; + array = _invlist_array_init(invlist, ! offset); + } + else { + /* Here, the existing list is non-empty. The current max entry in the + * list is generally the first value not in the set, except when the + * set extends to the end of permissible values, in which case it is + * the first entry in that final set, and so this call is an attempt to + * append out-of-order */ + + UV final_element = len - 1; + array = invlist_array(invlist); + if ( array[final_element] > start + || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) + { + Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c", + array[final_element], start, + ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); + } + + /* Here, it is a legal append. If the new range begins 1 above the end + * of the range below it, it is extending the range below it, so the + * new first value not in the set is one greater than the newly + * extended range. */ + offset = *get_invlist_offset_addr(invlist); + if (array[final_element] == start) { + if (end != UV_MAX) { + array[final_element] = end + 1; + } + else { + /* But if the end is the maximum representable on the machine, + * assume that infinity was actually what was meant. Just let + * the range that this would extend to have no end */ + invlist_set_len(invlist, len - 1, offset); + } + return; + } + } + + /* Here the new range doesn't extend any existing set. Add it */ + + len += 2; /* Includes an element each for the start and end of range */ + + /* If wll overflow the existing space, extend, which may cause the array to + * be moved */ + if (max < len) { + invlist_extend(invlist, len); + + /* Have to set len here to avoid assert failure in invlist_array() */ + invlist_set_len(invlist, len, offset); + + array = invlist_array(invlist); + } + else { + invlist_set_len(invlist, len, offset); + } + + /* The next item on the list starts the range, the one after that is + * one past the new range. */ + array[len - 2] = start; + if (end != UV_MAX) { + array[len - 1] = end + 1; + } + else { + /* But if the end is the maximum representable on the machine, just let + * the range have no end */ + invlist_set_len(invlist, len - 1, offset); + } +} + +SSize_t +Perl__invlist_search(SV* const invlist, const UV cp) +{ + /* Searches the inversion list for the entry that contains the input code + * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the + * return value is the index into the list's array of the range that + * contains <cp>, that is, 'i' such that + * array[i] <= cp < array[i+1] + */ + + IV low = 0; + IV mid; + IV high = _invlist_len(invlist); + const IV highest_element = high - 1; + const UV* array; + + PERL_ARGS_ASSERT__INVLIST_SEARCH; + + /* If list is empty, return failure. */ + if (UNLIKELY(high == 0)) { + return -1; + } + + /* (We can't get the array unless we know the list is non-empty) */ + array = invlist_array(invlist); + + mid = invlist_previous_index(invlist); + assert(mid >=0); + if (UNLIKELY(mid > highest_element)) { + mid = highest_element; + } + + /* <mid> contains the cache of the result of the previous call to this + * function (0 the first time). See if this call is for the same result, + * or if it is for mid-1. This is under the theory that calls to this + * function will often be for related code points that are near each other. + * And benchmarks show that caching gives better results. We also test + * here if the code point is within the bounds of the list. These tests + * replace others that would have had to be made anyway to make sure that + * the array bounds were not exceeded, and these give us extra information + * at the same time */ + if (cp >= array[mid]) { + if (cp >= array[highest_element]) { + return highest_element; + } + + /* Here, array[mid] <= cp < array[highest_element]. This means that + * the final element is not the answer, so can exclude it; it also + * means that <mid> is not the final element, so can refer to 'mid + 1' + * safely */ + if (cp < array[mid + 1]) { + return mid; + } + high--; + low = mid + 1; + } + else { /* cp < aray[mid] */ + if (cp < array[0]) { /* Fail if outside the array */ + return -1; + } + high = mid; + if (cp >= array[mid - 1]) { + goto found_entry; + } + } + + /* Binary search. What we are looking for is <i> such that + * array[i] <= cp < array[i+1] + * The loop below converges on the i+1. Note that there may not be an + * (i+1)th element in the array, and things work nonetheless */ + while (low < high) { + mid = (low + high) / 2; + assert(mid <= highest_element); + if (array[mid] <= cp) { /* cp >= array[mid] */ + low = mid + 1; + + /* We could do this extra test to exit the loop early. + if (cp < array[low]) { + return mid; + } + */ + } + else { /* cp < array[mid] */ + high = mid; + } + } + + found_entry: + high--; + invlist_set_previous_index(invlist, high); + return high; +} + +void +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** output) +{ + /* Take the union of two inversion lists and point '*output' to it. On + * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly + * even 'a' or 'b'). If to an inversion list, the contents of the original + * list will be replaced by the union. The first list, 'a', may be + * NULL, in which case a copy of the second list is placed in '*output'. + * If 'complement_b' is TRUE, the union is taken of the complement + * (inversion) of 'b' instead of b itself. + * + * The basis for this comes from "Unicode Demystified" Chapter 13 by + * Richard Gillam, published by Addison-Wesley, and explained at some + * length there. The preface says to incorporate its examples into your + * code at your own risk. + * + * The algorithm is like a merge sort. */ + + const UV* array_a; /* a's array */ + const UV* array_b; + UV len_a; /* length of a's array */ + UV len_b; + + SV* u; /* the resulting union */ + UV* array_u; + UV len_u = 0; + + UV i_a = 0; /* current index into a's array */ + UV i_b = 0; + UV i_u = 0; + + /* running count, as explained in the algorithm source book; items are + * stopped accumulating and are output when the count changes to/from 0. + * The count is incremented when we start a range that's in an input's set, + * and decremented when we start a range that's not in a set. So this + * variable can be 0, 1, or 2. When it is 0 neither input is in their set, + * and hence nothing goes into the union; 1, just one of the inputs is in + * its set (and its current range gets added to the union); and 2 when both + * inputs are in their sets. */ + UV count = 0; + + PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; + assert(a != b); + assert(*output == NULL || is_invlist(*output)); + + len_b = _invlist_len(b); + if (len_b == 0) { + + /* Here, 'b' is empty, hence it's complement is all possible code + * points. So if the union includes the complement of 'b', it includes + * everything, and we need not even look at 'a'. It's easiest to + * create a new inversion list that matches everything. */ + if (complement_b) { + SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX); + + if (*output == NULL) { /* If the output didn't exist, just point it + at the new list */ + *output = everything; + } + else { /* Otherwise, replace its contents with the new list */ + invlist_replace_list_destroys_src(*output, everything); + SvREFCNT_dec_NN(everything); + } + + return; + } + + /* Here, we don't want the complement of 'b', and since 'b' is empty, + * the union will come entirely from 'a'. If 'a' is NULL or empty, the + * output will be empty */ + + if (a == NULL || _invlist_len(a) == 0) { + if (*output == NULL) { + *output = _new_invlist(0); + } + else { + invlist_clear(*output); + } + return; + } + + /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the + * union. We can just return a copy of 'a' if '*output' doesn't point + * to an existing list */ + if (*output == NULL) { + *output = invlist_clone(a, NULL); + return; + } + + /* If the output is to overwrite 'a', we have a no-op, as it's + * already in 'a' */ + if (*output == a) { + return; + } + + /* Here, '*output' is to be overwritten by 'a' */ + u = invlist_clone(a, NULL); + invlist_replace_list_destroys_src(*output, u); + SvREFCNT_dec_NN(u); + + return; + } + + /* Here 'b' is not empty. See about 'a' */ + + if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { + + /* Here, 'a' is empty (and b is not). That means the union will come + * entirely from 'b'. If '*output' is NULL, we can directly return a + * clone of 'b'. Otherwise, we replace the contents of '*output' with + * the clone */ + + SV ** dest = (*output == NULL) ? output : &u; + *dest = invlist_clone(b, NULL); + if (complement_b) { + _invlist_invert(*dest); + } + + if (dest == &u) { + invlist_replace_list_destroys_src(*output, u); + SvREFCNT_dec_NN(u); + } + + return; + } + + /* Here both lists exist and are non-empty */ + array_a = invlist_array(a); + array_b = invlist_array(b); + + /* If are to take the union of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + /* Size the union for the worst case: that the sets are completely + * disjoint */ + u = _new_invlist(len_a + len_b); + + /* Will contain U+0000 if either component does */ + array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0) + || (len_b > 0 && array_b[0] == 0)); + + /* Go through each input list item by item, stopping when have exhausted + * one of them */ + while (i_a < len_a && i_b < len_b) { + UV cp; /* The element to potentially add to the union's array */ + bool cp_in_set; /* is it in the input list's set or not */ + + /* We need to take one or the other of the two inputs for the union. + * Since we are merging two sorted lists, we take the smaller of the + * next items. In case of a tie, we take first the one that is in its + * set. If we first took the one not in its set, it would decrement + * the count, possibly to 0 which would cause it to be output as ending + * the range, and the next time through we would take the same number, + * and output it again as beginning the next range. By doing it the + * opposite way, there is no possibility that the count will be + * momentarily decremented to 0, and thus the two adjoining ranges will + * be seamlessly merged. (In a tie and both are in the set or both not + * in the set, it doesn't matter which we take first.) */ + if ( array_a[i_a] < array_b[i_b] + || ( array_a[i_a] == array_b[i_b] + && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) + { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); + cp = array_a[i_a++]; + } + else { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); + cp = array_b[i_b++]; + } + + /* Here, have chosen which of the two inputs to look at. Only output + * if the running count changes to/from 0, which marks the + * beginning/end of a range that's in the set */ + if (cp_in_set) { + if (count == 0) { + array_u[i_u++] = cp; + } + count++; + } + else { + count--; + if (count == 0) { + array_u[i_u++] = cp; + } + } + } + + + /* The loop above increments the index into exactly one of the input lists + * each iteration, and ends when either index gets to its list end. That + * means the other index is lower than its end, and so something is + * remaining in that one. We decrement 'count', as explained below, if + * that list is in its set. (i_a and i_b each currently index the element + * beyond the one we care about.) */ + if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) + { + count--; + } + + /* Above we decremented 'count' if the list that had unexamined elements in + * it was in its set. This has made it so that 'count' being non-zero + * means there isn't anything left to output; and 'count' equal to 0 means + * that what is left to output is precisely that which is left in the + * non-exhausted input list. + * + * To see why, note first that the exhausted input obviously has nothing + * left to add to the union. If it was in its set at its end, that means + * the set extends from here to the platform's infinity, and hence so does + * the union and the non-exhausted set is irrelevant. The exhausted set + * also contributed 1 to 'count'. If 'count' was 2, it got decremented to + * 1, but if it was 1, the non-exhausted set wasn't in its set, and so + * 'count' remains at 1. This is consistent with the decremented 'count' + * != 0 meaning there's nothing left to add to the union. + * + * But if the exhausted input wasn't in its set, it contributed 0 to + * 'count', and the rest of the union will be whatever the other input is. + * If 'count' was 0, neither list was in its set, and 'count' remains 0; + * otherwise it gets decremented to 0. This is consistent with 'count' + * == 0 meaning the remainder of the union is whatever is left in the + * non-exhausted list. */ + if (count != 0) { + len_u = i_u; + } + else { + IV copy_count = len_a - i_a; + if (copy_count > 0) { /* The non-exhausted input is 'a' */ + Copy(array_a + i_a, array_u + i_u, copy_count, UV); + } + else { /* The non-exhausted input is b */ + copy_count = len_b - i_b; + Copy(array_b + i_b, array_u + i_u, copy_count, UV); + } + len_u = i_u + copy_count; + } + + /* Set the result to the final length, which can change the pointer to + * array_u, so re-find it. (Note that it is unlikely that this will + * change, as we are shrinking the space, not enlarging it) */ + if (len_u != _invlist_len(u)) { + invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); + invlist_trim(u); + array_u = invlist_array(u); + } + + if (*output == NULL) { /* Simply return the new inversion list */ + *output = u; + } + else { + /* Otherwise, overwrite the inversion list that was in '*output'. We + * could instead free '*output', and then set it to 'u', but experience + * has shown [perl #127392] that if the input is a mortal, we can get a + * huge build-up of these during regex compilation before they get + * freed. */ + invlist_replace_list_destroys_src(*output, u); + SvREFCNT_dec_NN(u); + } + + return; +} + +void +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, + const bool complement_b, SV** i) +{ + /* Take the intersection of two inversion lists and point '*i' to it. On + * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly + * even 'a' or 'b'). If to an inversion list, the contents of the original + * list will be replaced by the intersection. The first list, 'a', may be + * NULL, in which case '*i' will be an empty list. If 'complement_b' is + * TRUE, the result will be the intersection of 'a' and the complement (or + * inversion) of 'b' instead of 'b' directly. + * + * The basis for this comes from "Unicode Demystified" Chapter 13 by + * Richard Gillam, published by Addison-Wesley, and explained at some + * length there. The preface says to incorporate its examples into your + * code at your own risk. In fact, it had bugs + * + * The algorithm is like a merge sort, and is essentially the same as the + * union above + */ + + const UV* array_a; /* a's array */ + const UV* array_b; + UV len_a; /* length of a's array */ + UV len_b; + + SV* r; /* the resulting intersection */ + UV* array_r; + UV len_r = 0; + + UV i_a = 0; /* current index into a's array */ + UV i_b = 0; + UV i_r = 0; + + /* running count of how many of the two inputs are postitioned at ranges + * that are in their sets. As explained in the algorithm source book, + * items are stopped accumulating and are output when the count changes + * to/from 2. The count is incremented when we start a range that's in an + * input's set, and decremented when we start a range that's not in a set. + * Only when it is 2 are we in the intersection. */ + UV count = 0; + + PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; + assert(a != b); + assert(*i == NULL || is_invlist(*i)); + + /* Special case if either one is empty */ + len_a = (a == NULL) ? 0 : _invlist_len(a); + if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { + if (len_a != 0 && complement_b) { + + /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b' + * must be empty. Here, also we are using 'b's complement, which + * hence must be every possible code point. Thus the intersection + * is simply 'a'. */ + + if (*i == a) { /* No-op */ + return; + } + + if (*i == NULL) { + *i = invlist_clone(a, NULL); + return; + } + + r = invlist_clone(a, NULL); + invlist_replace_list_destroys_src(*i, r); + SvREFCNT_dec_NN(r); + return; + } + + /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The + * intersection must be empty */ + if (*i == NULL) { + *i = _new_invlist(0); + return; + } + + invlist_clear(*i); + return; + } + + /* Here both lists exist and are non-empty */ + array_a = invlist_array(a); + array_b = invlist_array(b); + + /* If are to take the intersection of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + /* Size the intersection for the worst case: that the intersection ends up + * fragmenting everything to be completely disjoint */ + r= _new_invlist(len_a + len_b); + + /* Will contain U+0000 iff both components do */ + array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 + && len_b > 0 && array_b[0] == 0); + + /* Go through each list item by item, stopping when have exhausted one of + * them */ + while (i_a < len_a && i_b < len_b) { + UV cp; /* The element to potentially add to the intersection's + array */ + bool cp_in_set; /* Is it in the input list's set or not */ + + /* We need to take one or the other of the two inputs for the + * intersection. Since we are merging two sorted lists, we take the + * smaller of the next items. In case of a tie, we take first the one + * that is not in its set (a difference from the union algorithm). If + * we first took the one in its set, it would increment the count, + * possibly to 2 which would cause it to be output as starting a range + * in the intersection, and the next time through we would take that + * same number, and output it again as ending the set. By doing the + * opposite of this, there is no possibility that the count will be + * momentarily incremented to 2. (In a tie and both are in the set or + * both not in the set, it doesn't matter which we take first.) */ + if ( array_a[i_a] < array_b[i_b] + || ( array_a[i_a] == array_b[i_b] + && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) + { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); + cp = array_a[i_a++]; + } + else { + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); + cp= array_b[i_b++]; + } + + /* Here, have chosen which of the two inputs to look at. Only output + * if the running count changes to/from 2, which marks the + * beginning/end of a range that's in the intersection */ + if (cp_in_set) { + count++; + if (count == 2) { + array_r[i_r++] = cp; + } + } + else { + if (count == 2) { + array_r[i_r++] = cp; + } + count--; + } + + } + + /* The loop above increments the index into exactly one of the input lists + * each iteration, and ends when either index gets to its list end. That + * means the other index is lower than its end, and so something is + * remaining in that one. We increment 'count', as explained below, if the + * exhausted list was in its set. (i_a and i_b each currently index the + * element beyond the one we care about.) */ + if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) + { + count++; + } + + /* Above we incremented 'count' if the exhausted list was in its set. This + * has made it so that 'count' being below 2 means there is nothing left to + * output; otheriwse what's left to add to the intersection is precisely + * that which is left in the non-exhausted input list. + * + * To see why, note first that the exhausted input obviously has nothing + * left to affect the intersection. If it was in its set at its end, that + * means the set extends from here to the platform's infinity, and hence + * anything in the non-exhausted's list will be in the intersection, and + * anything not in it won't be. Hence, the rest of the intersection is + * precisely what's in the non-exhausted list The exhausted set also + * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing + * it means 'count' is now at least 2. This is consistent with the + * incremented 'count' being >= 2 means to add the non-exhausted list to + * the intersection. + * + * But if the exhausted input wasn't in its set, it contributed 0 to + * 'count', and the intersection can't include anything further; the + * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get + * incremented. This is consistent with 'count' being < 2 meaning nothing + * further to add to the intersection. */ + if (count < 2) { /* Nothing left to put in the intersection. */ + len_r = i_r; + } + else { /* copy the non-exhausted list, unchanged. */ + IV copy_count = len_a - i_a; + if (copy_count > 0) { /* a is the one with stuff left */ + Copy(array_a + i_a, array_r + i_r, copy_count, UV); + } + else { /* b is the one with stuff left */ + copy_count = len_b - i_b; + Copy(array_b + i_b, array_r + i_r, copy_count, UV); + } + len_r = i_r + copy_count; + } + + /* Set the result to the final length, which can change the pointer to + * array_r, so re-find it. (Note that it is unlikely that this will + * change, as we are shrinking the space, not enlarging it) */ + if (len_r != _invlist_len(r)) { + invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); + invlist_trim(r); + array_r = invlist_array(r); + } + + if (*i == NULL) { /* Simply return the calculated intersection */ + *i = r; + } + else { /* Otherwise, replace the existing inversion list in '*i'. We could + instead free '*i', and then set it to 'r', but experience has + shown [perl #127392] that if the input is a mortal, we can get a + huge build-up of these during regex compilation before they get + freed. */ + if (len_r) { + invlist_replace_list_destroys_src(*i, r); + } + else { + invlist_clear(*i); + } + SvREFCNT_dec_NN(r); + } + + return; +} + +SV* +Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end) +{ + /* Add the range from 'start' to 'end' inclusive to the inversion list's + * set. A pointer to the inversion list is returned. This may actually be + * a new list, in which case the passed in one has been destroyed. The + * passed-in inversion list can be NULL, in which case a new one is created + * with just the one range in it. The new list is not necessarily + * NUL-terminated. Space is not freed if the inversion list shrinks as a + * result of this function. The gain would not be large, and in many + * cases, this is called multiple times on a single inversion list, so + * anything freed may almost immediately be needed again. + * + * This used to mostly call the 'union' routine, but that is much more + * heavyweight than really needed for a single range addition */ + + UV* array; /* The array implementing the inversion list */ + UV len; /* How many elements in 'array' */ + SSize_t i_s; /* index into the invlist array where 'start' + should go */ + SSize_t i_e = 0; /* And the index where 'end' should go */ + UV cur_highest; /* The highest code point in the inversion list + upon entry to this function */ + + /* This range becomes the whole inversion list if none already existed */ + if (invlist == NULL) { + invlist = _new_invlist(2); + _append_range_to_invlist(invlist, start, end); + return invlist; + } + + /* Likewise, if the inversion list is currently empty */ + len = _invlist_len(invlist); + if (len == 0) { + _append_range_to_invlist(invlist, start, end); + return invlist; + } + + /* Starting here, we have to know the internals of the list */ + array = invlist_array(invlist); + + /* If the new range ends higher than the current highest ... */ + cur_highest = invlist_highest(invlist); + if (end > cur_highest) { + + /* If the whole range is higher, we can just append it */ + if (start > cur_highest) { + _append_range_to_invlist(invlist, start, end); + return invlist; + } + + /* Otherwise, add the portion that is higher ... */ + _append_range_to_invlist(invlist, cur_highest + 1, end); + + /* ... and continue on below to handle the rest. As a result of the + * above append, we know that the index of the end of the range is the + * final even numbered one of the array. Recall that the final element + * always starts a range that extends to infinity. If that range is in + * the set (meaning the set goes from here to infinity), it will be an + * even index, but if it isn't in the set, it's odd, and the final + * range in the set is one less, which is even. */ + if (end == UV_MAX) { + i_e = len; + } + else { + i_e = len - 2; + } + } + + /* We have dealt with appending, now see about prepending. If the new + * range starts lower than the current lowest ... */ + if (start < array[0]) { + + /* Adding something which has 0 in it is somewhat tricky, and uncommon. + * Let the union code handle it, rather than having to know the + * trickiness in two code places. */ + if (UNLIKELY(start == 0)) { + SV* range_invlist; + + range_invlist = _new_invlist(2); + _append_range_to_invlist(range_invlist, start, end); + + _invlist_union(invlist, range_invlist, &invlist); + + SvREFCNT_dec_NN(range_invlist); + + return invlist; + } + + /* If the whole new range comes before the first entry, and doesn't + * extend it, we have to insert it as an additional range */ + if (end < array[0] - 1) { + i_s = i_e = -1; + goto splice_in_new_range; + } + + /* Here the new range adjoins the existing first range, extending it + * downwards. */ + array[0] = start; + + /* And continue on below to handle the rest. We know that the index of + * the beginning of the range is the first one of the array */ + i_s = 0; + } + else { /* Not prepending any part of the new range to the existing list. + * Find where in the list it should go. This finds i_s, such that: + * invlist[i_s] <= start < array[i_s+1] + */ + i_s = _invlist_search(invlist, start); + } + + /* At this point, any extending before the beginning of the inversion list + * and/or after the end has been done. This has made it so that, in the + * code below, each endpoint of the new range is either in a range that is + * in the set, or is in a gap between two ranges that are. This means we + * don't have to worry about exceeding the array bounds. + * + * Find where in the list the new range ends (but we can skip this if we + * have already determined what it is, or if it will be the same as i_s, + * which we already have computed) */ + if (i_e == 0) { + i_e = (start == end) + ? i_s + : _invlist_search(invlist, end); + } + + /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e] + * is a range that goes to infinity there is no element at invlist[i_e+1], + * so only the first relation holds. */ + + if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) { + + /* Here, the ranges on either side of the beginning of the new range + * are in the set, and this range starts in the gap between them. + * + * The new range extends the range above it downwards if the new range + * ends at or above that range's start */ + const bool extends_the_range_above = ( end == UV_MAX + || end + 1 >= array[i_s+1]); + + /* The new range extends the range below it upwards if it begins just + * after where that range ends */ + if (start == array[i_s]) { + + /* If the new range fills the entire gap between the other ranges, + * they will get merged together. Other ranges may also get + * merged, depending on how many of them the new range spans. In + * the general case, we do the merge later, just once, after we + * figure out how many to merge. But in the case where the new + * range exactly spans just this one gap (possibly extending into + * the one above), we do the merge here, and an early exit. This + * is done here to avoid having to special case later. */ + if (i_e - i_s <= 1) { + + /* If i_e - i_s == 1, it means that the new range terminates + * within the range above, and hence 'extends_the_range_above' + * must be true. (If the range above it extends to infinity, + * 'i_s+2' will be above the array's limit, but 'len-i_s-2' + * will be 0, so no harm done.) */ + if (extends_the_range_above) { + Move(array + i_s + 2, array + i_s, len - i_s - 2, UV); + invlist_set_len(invlist, + len - 2, + *(get_invlist_offset_addr(invlist))); + return invlist; + } + + /* Here, i_e must == i_s. We keep them in sync, as they apply + * to the same range, and below we are about to decrement i_s + * */ + i_e--; + } + + /* Here, the new range is adjacent to the one below. (It may also + * span beyond the range above, but that will get resolved later.) + * Extend the range below to include this one. */ + array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1; + i_s--; + start = array[i_s]; + } + else if (extends_the_range_above) { + + /* Here the new range only extends the range above it, but not the + * one below. It merges with the one above. Again, we keep i_e + * and i_s in sync if they point to the same range */ + if (i_e == i_s) { + i_e++; + } + i_s++; + array[i_s] = start; + } + } + + /* Here, we've dealt with the new range start extending any adjoining + * existing ranges. + * + * If the new range extends to infinity, it is now the final one, + * regardless of what was there before */ + if (UNLIKELY(end == UV_MAX)) { + invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist))); + return invlist; + } + + /* If i_e started as == i_s, it has also been dealt with, + * and been updated to the new i_s, which will fail the following if */ + if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) { + + /* Here, the ranges on either side of the end of the new range are in + * the set, and this range ends in the gap between them. + * + * If this range is adjacent to (hence extends) the range above it, it + * becomes part of that range; likewise if it extends the range below, + * it becomes part of that range */ + if (end + 1 == array[i_e+1]) { + i_e++; + array[i_e] = start; + } + else if (start <= array[i_e]) { + array[i_e] = end + 1; + i_e--; + } + } + + if (i_s == i_e) { + + /* If the range fits entirely in an existing range (as possibly already + * extended above), it doesn't add anything new */ + if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) { + return invlist; + } + + /* Here, no part of the range is in the list. Must add it. It will + * occupy 2 more slots */ + splice_in_new_range: + + invlist_extend(invlist, len + 2); + array = invlist_array(invlist); + /* Move the rest of the array down two slots. Don't include any + * trailing NUL */ + Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV); + + /* Do the actual splice */ + array[i_e+1] = start; + array[i_e+2] = end + 1; + invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist))); + return invlist; + } + + /* Here the new range crossed the boundaries of a pre-existing range. The + * code above has adjusted things so that both ends are in ranges that are + * in the set. This means everything in between must also be in the set. + * Just squash things together */ + Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV); + invlist_set_len(invlist, + len - i_e + i_s, + *(get_invlist_offset_addr(invlist))); + + return invlist; +} + +SV* +Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, + UV** other_elements_ptr) +{ + /* Create and return an inversion list whose contents are to be populated + * by the caller. The caller gives the number of elements (in 'size') and + * the very first element ('element0'). This function will set + * '*other_elements_ptr' to an array of UVs, where the remaining elements + * are to be placed. + * + * Obviously there is some trust involved that the caller will properly + * fill in the other elements of the array. + * + * (The first element needs to be passed in, as the underlying code does + * things differently depending on whether it is zero or non-zero) */ + + SV* invlist = _new_invlist(size); + bool offset; + + PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; + + invlist = add_cp_to_invlist(invlist, element0); + offset = *get_invlist_offset_addr(invlist); + + invlist_set_len(invlist, size, offset); + *other_elements_ptr = invlist_array(invlist) + 1; + return invlist; +} + +#endif + +#ifndef PERL_IN_XSUB_RE +void +Perl__invlist_invert(pTHX_ SV* const invlist) +{ + /* Complement the input inversion list. This adds a 0 if the list didn't + * have a zero; removes it otherwise. As described above, the data + * structure is set up so that this is very efficient */ + + PERL_ARGS_ASSERT__INVLIST_INVERT; + + assert(! invlist_is_iterating(invlist)); + + /* The inverse of matching nothing is matching everything */ + if (_invlist_len(invlist) == 0) { + _append_range_to_invlist(invlist, 0, UV_MAX); + return; + } + + *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); +} + +SV* +Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist) +{ + /* Return a new inversion list that is a copy of the input one, which is + * unchanged. The new list will not be mortal even if the old one was. */ + + const STRLEN nominal_length = _invlist_len(invlist); + const STRLEN physical_length = SvCUR(invlist); + const bool offset = *(get_invlist_offset_addr(invlist)); + + PERL_ARGS_ASSERT_INVLIST_CLONE; + + if (new_invlist == NULL) { + new_invlist = _new_invlist(nominal_length); + } + else { + sv_upgrade(new_invlist, SVt_INVLIST); + initialize_invlist_guts(new_invlist, nominal_length); + } + + *(get_invlist_offset_addr(new_invlist)) = offset; + invlist_set_len(new_invlist, nominal_length, offset); + Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); + + return new_invlist; +} + +#endif + + +#ifndef PERL_IN_XSUB_RE +void +Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, + const char * const indent, SV* const invlist) +{ + /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the + * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by + * the string 'indent'. The output looks like this: + [0] 0x000A .. 0x000D + [2] 0x0085 + [4] 0x2028 .. 0x2029 + [6] 0x3104 .. INFTY + * This means that the first range of code points matched by the list are + * 0xA through 0xD; the second range contains only the single code point + * 0x85, etc. An inversion list is an array of UVs. Two array elements + * are used to define each range (except if the final range extends to + * infinity, only a single element is needed). The array index of the + * first element for the corresponding range is given in brackets. */ + + UV start, end; + STRLEN count = 0; + + PERL_ARGS_ASSERT__INVLIST_DUMP; + + if (invlist_is_iterating(invlist)) { + Perl_dump_indent(aTHX_ level, file, + "%sCan't dump inversion list because is in middle of iterating\n", + indent); + return; + } + + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (end == UV_MAX) { + Perl_dump_indent(aTHX_ level, file, + "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n", + indent, (UV)count, start); + } + else if (end != start) { + Perl_dump_indent(aTHX_ level, file, + "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n", + indent, (UV)count, start, end); + } + else { + Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n", + indent, (UV)count, start); + } + count += 2; + } +} + +#endif + +#if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE) +bool +Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) +{ + /* Return a boolean as to if the two passed in inversion lists are + * identical. The final argument, if TRUE, says to take the complement of + * the second inversion list before doing the comparison */ + + const UV len_a = _invlist_len(a); + UV len_b = _invlist_len(b); + + const UV* array_a = NULL; + const UV* array_b = NULL; + + PERL_ARGS_ASSERT__INVLISTEQ; + + /* This code avoids accessing the arrays unless it knows the length is + * non-zero */ + + if (len_a == 0) { + if (len_b == 0) { + return ! complement_b; + } + } + else { + array_a = invlist_array(a); + } + + if (len_b != 0) { + array_b = invlist_array(b); + } + + /* If are to compare 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* The complement of nothing is everything, so <a> would have to have + * just one element, starting at zero (ending at infinity) */ + if (len_b == 0) { + return (len_a == 1 && array_a[0] == 0); + } + if (array_b[0] == 0) { + + /* Otherwise, to complement, we invert. Here, the first element is + * 0, just remove it. To do this, we just pretend the array starts + * one later */ + + array_b++; + len_b--; + } + else { + + /* But if the first element is not zero, we pretend the list starts + * at the 0 that is always stored immediately before the array. */ + array_b--; + len_b++; + } + } + + return len_a == len_b + && memEQ(array_a, array_b, len_a * sizeof(array_a[0])); + +} +#endif + +#undef HEADER_LENGTH +#undef TO_INTERNAL_SIZE +#undef FROM_INTERNAL_SIZE +#undef INVLIST_VERSION_ID + +/* End of inversion list object */ diff --git a/regcomp_study.c b/regcomp_study.c new file mode 100644 index 0000000000..bf61d34f48 --- /dev/null +++ b/regcomp_study.c @@ -0,0 +1,3808 @@ +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + +#include "EXTERN.h" +#define PERL_IN_REGEX_ENGINE +#define PERL_IN_REGCOMP_ANY +#define PERL_IN_REGCOMP_STUDY_C +#include "perl.h" + +#ifdef PERL_IN_XSUB_RE +# include "re_comp.h" +#else +# include "regcomp.h" +#endif + +#include "invlist_inline.h" +#include "unicode_constants.h" +#include "regcomp_internal.h" + +#define INIT_AND_WITHP \ + assert(!and_withp); \ + Newx(and_withp, 1, regnode_ssc); \ + SAVEFREEPV(and_withp) + + +STATIC void +S_unwind_scan_frames(pTHX_ const void *p) +{ + PERL_ARGS_ASSERT_UNWIND_SCAN_FRAMES; + scan_frame *f= (scan_frame *)p; + do { + scan_frame *n= f->next_frame; + Safefree(f); + f= n; + } while (f); +} + +/* Follow the next-chain of the current node and optimize away + all the NOTHINGs from it. + */ +STATIC void +S_rck_elide_nothing(pTHX_ regnode *node) +{ + PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING; + + if (OP(node) != CURLYX) { + const int max = (REGNODE_OFF_BY_ARG(OP(node)) + ? I32_MAX + /* I32 may be smaller than U16 on CRAYs! */ + : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); + int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG(node) : NEXT_OFF(node)); + int noff; + regnode *n = node; + + /* Skip NOTHING and LONGJMP. */ + while ( + (n = regnext(n)) + && ( + (REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n))) + || ((OP(n) == LONGJMP) && (noff = ARG(n))) + ) + && off + noff < max + ) { + off += noff; + } + if (REGNODE_OFF_BY_ARG(OP(node))) + ARG(node) = off; + else + NEXT_OFF(node) = off; + } + return; +} + + +/* + * As best we can, determine the characters that can match the start of + * the given EXACTF-ish node. This is for use in creating ssc nodes, so there + * can be false positive matches + * + * Returns the invlist as a new SV*; it is the caller's responsibility to + * call SvREFCNT_dec() when done with it. + */ +STATIC SV* +S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) +{ + const U8 * s = (U8*)STRING(node); + SSize_t bytelen = STR_LEN(node); + UV uc; + /* Start out big enough for 2 separate code points */ + SV* invlist = _new_invlist(4); + + PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST; + + if (! UTF) { + uc = *s; + + /* We punt and assume can match anything if the node begins + * with a multi-character fold. Things are complicated. For + * example, /ffi/i could match any of: + * "\N{LATIN SMALL LIGATURE FFI}" + * "\N{LATIN SMALL LIGATURE FF}I" + * "F\N{LATIN SMALL LIGATURE FI}" + * plus several other things; and making sure we have all the + * possibilities is hard. */ + if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) { + invlist = _add_range_to_invlist(invlist, 0, UV_MAX); + } + else { + /* Any Latin1 range character can potentially match any + * other depending on the locale, and in Turkic locales, 'I' and + * 'i' can match U+130 and U+131 */ + if (OP(node) == EXACTFL) { + _invlist_union(invlist, PL_Latin1, &invlist); + if (isALPHA_FOLD_EQ(uc, 'I')) { + invlist = add_cp_to_invlist(invlist, + LATIN_SMALL_LETTER_DOTLESS_I); + invlist = add_cp_to_invlist(invlist, + LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); + } + } + else { + /* But otherwise, it matches at least itself. We can + * quickly tell if it has a distinct fold, and if so, + * it matches that as well */ + invlist = add_cp_to_invlist(invlist, uc); + if (IS_IN_SOME_FOLD_L1(uc)) + invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]); + } + + /* Some characters match above-Latin1 ones under /i. This + * is true of EXACTFL ones when the locale is UTF-8 */ + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) + && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA, + EXACTFAA_NO_TRIE))) + { + add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist); + } + } + } + else { /* Pattern is UTF-8 */ + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + const U8* e = s + bytelen; + IV fc; + + fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); + + /* The only code points that aren't folded in a UTF EXACTFish + * node are the problematic ones in EXACTFL nodes */ + if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) { + /* We need to check for the possibility that this EXACTFL + * node begins with a multi-char fold. Therefore we fold + * the first few characters of it so that we can make that + * check */ + U8 *d = folded; + int i; + + fc = -1; + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD(*s); + if (fc < 0) { /* Save the first fold */ + fc = *(d-1); + } + s++; + } + else { + STRLEN len; + UV fold = toFOLD_utf8_safe(s, e, d, &len); + if (fc < 0) { /* Save the first fold */ + fc = fold; + } + d += len; + s += UTF8SKIP(s); + } + } + + /* And set up so the code below that looks in this folded + * buffer instead of the node's string */ + e = d; + s = folded; + } + + /* When we reach here 's' points to the fold of the first + * character(s) of the node; and 'e' points to far enough along + * the folded string to be just past any possible multi-char + * fold. + * + * Like the non-UTF case above, we punt if the node begins with a + * multi-char fold */ + + if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { + invlist = _add_range_to_invlist(invlist, 0, UV_MAX); + } + else { /* Single char fold */ + unsigned int k; + U32 first_fold; + const U32 * remaining_folds; + Size_t folds_count; + + /* It matches itself */ + invlist = add_cp_to_invlist(invlist, fc); + + /* ... plus all the things that fold to it, which are found in + * PL_utf8_foldclosures */ + folds_count = _inverse_folds(fc, &first_fold, + &remaining_folds); + for (k = 0; k < folds_count; k++) { + UV c = (k == 0) ? first_fold : remaining_folds[k-1]; + + /* /aa doesn't allow folds between ASCII and non- */ + if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE) + && isASCII(c) != isASCII(fc)) + { + continue; + } + + invlist = add_cp_to_invlist(invlist, c); + } + + if (OP(node) == EXACTFL) { + + /* If either [iI] are present in an EXACTFL node the above code + * should have added its normal case pair, but under a Turkish + * locale they could match instead the case pairs from it. Add + * those as potential matches as well */ + if (isALPHA_FOLD_EQ(fc, 'I')) { + invlist = add_cp_to_invlist(invlist, + LATIN_SMALL_LETTER_DOTLESS_I); + invlist = add_cp_to_invlist(invlist, + LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); + } + else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) { + invlist = add_cp_to_invlist(invlist, 'I'); + } + else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { + invlist = add_cp_to_invlist(invlist, 'i'); + } + } + } + } + + return invlist; +} + + +/* Mark that we cannot extend a found fixed substring at this point. + Update the longest found anchored substring or the longest found + floating substrings if needed. */ + +void +Perl_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, + SSize_t *minlenp, int is_inf) +{ + const STRLEN l = CHR_SVLEN(data->last_found); + SV * const longest_sv = data->substrs[data->cur_is_floating].str; + const STRLEN old_l = CHR_SVLEN(longest_sv); + DECLARE_AND_GET_RE_DEBUG_FLAGS; + + PERL_ARGS_ASSERT_SCAN_COMMIT; + + if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { + const U8 i = data->cur_is_floating; + SvSetMagicSV(longest_sv, data->last_found); + data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min; + + if (!i) /* fixed */ + data->substrs[0].max_offset = data->substrs[0].min_offset; + else { /* float */ + data->substrs[1].max_offset = + (is_inf) + ? OPTIMIZE_INFTY + : (l + ? data->last_start_max + : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min + ? OPTIMIZE_INFTY + : data->pos_min + data->pos_delta)); + } + + data->substrs[i].flags &= ~SF_BEFORE_EOL; + data->substrs[i].flags |= data->flags & SF_BEFORE_EOL; + data->substrs[i].minlenp = minlenp; + data->substrs[i].lookbehind = 0; + } + + SvCUR_set(data->last_found, 0); + { + SV * const sv = data->last_found; + if (SvUTF8(sv) && SvMAGICAL(sv)) { + MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); + if (mg) + mg->mg_len = 0; + } + } + data->last_end = -1; + data->flags &= ~SF_BEFORE_EOL; + DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1); +} + +/* An SSC is just a regnode_charclass_posix with an extra field: the inversion + * list that describes which code points it matches */ + +STATIC void +S_ssc_anything(pTHX_ regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to match an empty string or any code point */ + + PERL_ARGS_ASSERT_SSC_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* mortalize so won't leak */ + ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX)); + ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */ +} + +STATIC int +S_ssc_is_anything(const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' can match the empty string and any code + * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys + * us anything: if the function returns TRUE, 'ssc' hasn't been restricted + * in any way, so there's no point in using it */ + + UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */ + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_ANYTHING; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) { + return FALSE; + } + + /* See if the list consists solely of the range 0 - Infinity */ + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (ret) { + return TRUE; + } + + /* If e.g., both \w and \W are set, matches everything */ + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + int i; + for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { + return TRUE; + } + } + } + + return FALSE; +} + +void +Perl_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* Initializes the SSC 'ssc'. This includes setting it to match an empty + * string, any code point, or any posix class under locale */ + + PERL_ARGS_ASSERT_SSC_INIT; + + Zero(ssc, 1, regnode_ssc); + set_ANYOF_SYNTHETIC(ssc); + ARG_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE); + ssc_anything(ssc); + + /* If any portion of the regex is to operate under locale rules that aren't + * fully known at compile time, initialization includes it. The reason + * this isn't done for all regexes is that the optimizer was written under + * the assumption that locale was all-or-nothing. Given the complexity and + * lack of documentation in the optimizer, and that there are inadequate + * test cases for locale, many parts of it may not work properly, it is + * safest to avoid locale unless necessary. */ + if (RExC_contains_locale) { + ANYOF_POSIXL_SETALL(ssc); + } + else { + ANYOF_POSIXL_ZERO(ssc); + } +} + +STATIC int +S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, + const regnode_ssc *ssc) +{ + /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only + * to the list of code points matched, and locale posix classes; hence does + * not check its flags) */ + + UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */ + bool ret; + + PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + invlist_iterinit(ssc->invlist); + ret = invlist_iternext(ssc->invlist, &start, &end) + && start == 0 + && end == UV_MAX; + + invlist_iterfinish(ssc->invlist); + + if (! ret) { + return FALSE; + } + + if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { + return FALSE; + } + + return TRUE; +} + + +STATIC SV* +S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, + const regnode_charclass* const node) +{ + /* Returns a mortal inversion list defining which code points are matched + * by 'node', which is of ANYOF-ish type . Handles complementing the + * result if appropriate. If some code points aren't knowable at this + * time, the returned list must, and will, contain every code point that is + * a possibility. */ + + SV* invlist = NULL; + SV* only_utf8_locale_invlist = NULL; + bool new_node_has_latin1 = FALSE; + const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF) + ? ANYOF_FLAGS(node) + : 0; + + PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; + + /* Look at the data structure created by S_set_ANYOF_arg() */ + if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) { + invlist = sv_2mortal(_new_invlist(1)); + invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX); + } + else if (ANYOF_HAS_AUX(node)) { + const U32 n = ARG(node); + SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + + if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) { + + /* Here there are things that won't be known until runtime -- we + * have to assume it could be anything */ + invlist = sv_2mortal(_new_invlist(1)); + return _add_range_to_invlist(invlist, 0, UV_MAX); + } + else if (ary[INVLIST_INDEX]) { + + /* Use the node's inversion list */ + invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL)); + } + + /* Get the code points valid only under UTF-8 locales */ + if ( (flags & ANYOFL_FOLD) + && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) + { + only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX]; + } + } + + if (! invlist) { + invlist = sv_2mortal(_new_invlist(0)); + } + + /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS + * code points, and an inversion list for the others, but if there are code + * points that should match only conditionally on the target string being + * UTF-8, those are placed in the inversion list, and not the bitmap. + * Since there are circumstances under which they could match, they are + * included in the SSC. But if the ANYOF node is to be inverted, we have + * to exclude them here, so that when we invert below, the end result + * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We + * have to do this here before we add the unconditionally matched code + * points */ + if (flags & ANYOF_INVERT) { + _invlist_intersection_complement_2nd(invlist, + PL_UpperLatin1, + &invlist); + } + + /* Add in the points from the bit map */ + if (REGNODE_TYPE(OP(node)) == ANYOF){ + for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { + if (ANYOF_BITMAP_TEST(node, i)) { + unsigned int start = i++; + + for (; i < NUM_ANYOF_CODE_POINTS + && ANYOF_BITMAP_TEST(node, i); ++i) + { + /* empty */ + } + invlist = _add_range_to_invlist(invlist, start, i-1); + new_node_has_latin1 = TRUE; + } + } + } + + /* If this can match all upper Latin1 code points, have to add them + * as well. But don't add them if inverting, as when that gets done below, + * it would exclude all these characters, including the ones it shouldn't + * that were added just above */ + if ( ! (flags & ANYOF_INVERT) + && OP(node) == ANYOFD + && (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)) + { + _invlist_union(invlist, PL_UpperLatin1, &invlist); + } + + /* Similarly for these */ + if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) { + _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist); + } + + if (flags & ANYOF_INVERT) { + _invlist_invert(invlist); + } + else if (flags & ANYOFL_FOLD) { + if (new_node_has_latin1) { + + /* These folds are potential in Turkic locales */ + if (_invlist_contains_cp(invlist, 'i')) { + invlist = add_cp_to_invlist(invlist, + LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); + } + if (_invlist_contains_cp(invlist, 'I')) { + invlist = add_cp_to_invlist(invlist, + LATIN_SMALL_LETTER_DOTLESS_I); + } + + /* Under /li, any 0-255 could fold to any other 0-255, depending on + * the locale. We can skip this if there are no 0-255 at all. */ + _invlist_union(invlist, PL_Latin1, &invlist); + } + else { + if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) { + invlist = add_cp_to_invlist(invlist, 'I'); + } + if (_invlist_contains_cp(invlist, + LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)) + { + invlist = add_cp_to_invlist(invlist, 'i'); + } + } + } + + /* Similarly add the UTF-8 locale possible matches. These have to be + * deferred until after the non-UTF-8 locale ones are taken care of just + * above, or it leads to wrong results under ANYOF_INVERT */ + if (only_utf8_locale_invlist) { + _invlist_union_maybe_complement_2nd(invlist, + only_utf8_locale_invlist, + flags & ANYOF_INVERT, + &invlist); + } + + return invlist; +} + +/* 'AND' a given class with another one. Can create false positives. 'ssc' + * should not be inverted. */ + +STATIC void +S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *and_with) +{ + /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either + * another SSC or a regular ANYOF class. Can create false positives. */ + + SV* anded_cp_list; + U8 and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF) + ? ANYOF_FLAGS(and_with) + : 0; + U8 anded_flags; + + PERL_ARGS_ASSERT_SSC_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(and_with)) { + anded_cp_list = ((regnode_ssc *)and_with)->invlist; + anded_flags = and_with_flags; + + /* XXX This is a kludge around what appears to be deficiencies in the + * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, + * there are paths through the optimizer where it doesn't get weeded + * out when it should. And if we don't make some extra provision for + * it like the code just below, it doesn't get added when it should. + * This solution is to add it only when AND'ing, which is here, and + * only when what is being AND'ed is the pristine, original node + * matching anything. Thus it is like adding it to ssc_anything() but + * only when the result is to be AND'ed. Probably the same solution + * could be adopted for the same problem we have with /l matching, + * which is solved differently in S_ssc_init(), and that would lead to + * fewer false positives than that solution has. But if this solution + * creates bugs, the consequences are only that a warning isn't raised + * that should be; while the consequences for having /l bugs is + * incorrect matches */ + if (ssc_is_anything((regnode_ssc *)and_with)) { + anded_flags |= ANYOF_WARN_SUPER__shared; + } + } + else { + anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); + if (OP(and_with) == ANYOFD) { + anded_flags = and_with_flags & ANYOF_COMMON_FLAGS; + } + else { + anded_flags = and_with_flags + & ( ANYOF_COMMON_FLAGS + |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared + |ANYOF_HAS_EXTRA_RUNTIME_MATCHES); + if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) { + anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES; + } + } + } + + ANYOF_FLAGS(ssc) &= anded_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'and-with'; P2, its posix classes. + * 'and_with' may be inverted. When not inverted, we have the situation of + * computing: + * (C1 | P1) & (C2 | P2) + * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) + * <= ((C1 & C2) | P1 | P2) + * Alternatively, the last few steps could be: + * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) + * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) + * <= (C1 | C2 | (P1 & P2)) + * We favor the second approach if either P1 or P2 is non-empty. This is + * because these components are a barrier to doing optimizations, as what + * they match cannot be known until the moment of matching as they are + * dependent on the current locale, 'AND"ing them likely will reduce or + * eliminate them. + * But we can do better if we know that C1,P1 are in their initial state (a + * frequent occurrence), each matching everything: + * (<everything>) & (C2 | P2) = C2 | P2 + * Similarly, if C2,P2 are in their initial state (again a frequent + * occurrence), the result is a no-op + * (C1 | P1) & (<everything>) = C1 | P1 + * + * Inverted, we have + * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) + * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) + * <= (C1 & ~C2) | (P1 & ~P2) + * */ + + if ((and_with_flags & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(and_with)) + { + unsigned int i; + + ssc_intersection(ssc, + anded_cp_list, + FALSE /* Has already been inverted */ + ); + + /* If either P1 or P2 is empty, the intersection will be also; can skip + * the loop */ + if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) { + ANYOF_POSIXL_ZERO(ssc); + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + + /* Note that the Posix class component P from 'and_with' actually + * looks like: + * P = Pa | Pb | ... | Pn + * where each component is one posix class, such as in [\w\s]. + * Thus + * ~P = ~(Pa | Pb | ... | Pn) + * = ~Pa & ~Pb & ... & ~Pn + * <= ~Pa | ~Pb | ... | ~Pn + * The last is something we can easily calculate, but unfortunately + * is likely to have many false positives. We could do better + * in some (but certainly not all) instances if two classes in + * P have known relationships. For example + * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: + * So + * :lower: & :print: = :lower: + * And similarly for classes that must be disjoint. For example, + * since \s and \w can have no elements in common based on rules in + * the POSIX standard, + * \w & ^\S = nothing + * Unfortunately, some vendor locales do not meet the Posix + * standard, in particular almost everything by Microsoft. + * The loop below just changes e.g., \w into \W and vice versa */ + + regnode_charclass_posixl temp; + int add = 1; /* To calculate the index of the complement */ + + Zero(&temp, 1, regnode_charclass_posixl); + ANYOF_POSIXL_ZERO(&temp); + for (i = 0; i < ANYOF_MAX; i++) { + assert(i % 2 != 0 + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) + || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); + + if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { + ANYOF_POSIXL_SET(&temp, i + add); + } + add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ + } + ANYOF_POSIXL_AND(&temp, ssc); + + } /* else ssc already has no posixes */ + } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC + in its initial state */ + else if (! is_ANYOF_SYNTHETIC(and_with) + || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) + { + /* But if 'ssc' is in its initial state, the result is just 'and_with'; + * copy it over 'ssc' */ + if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { + if (is_ANYOF_SYNTHETIC(and_with)) { + StructCopy(and_with, ssc, regnode_ssc); + } + else { + ssc->invlist = anded_cp_list; + ANYOF_POSIXL_ZERO(ssc); + if (and_with_flags & ANYOF_MATCHES_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); + } + } + } + else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) + || (and_with_flags & ANYOF_MATCHES_POSIXL)) + { + /* One or the other of P1, P2 is non-empty. */ + if (and_with_flags & ANYOF_MATCHES_POSIXL) { + ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); + } + ssc_union(ssc, anded_cp_list, FALSE); + } + else { /* P1 = P2 = empty */ + ssc_intersection(ssc, anded_cp_list, FALSE); + } + } +} + +STATIC void +S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, + const regnode_charclass *or_with) +{ + /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either + * another SSC or a regular ANYOF class. Can create false positives if + * 'or_with' is to be inverted. */ + + SV* ored_cp_list; + U8 ored_flags; + U8 or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF) + ? ANYOF_FLAGS(or_with) + : 0; + + PERL_ARGS_ASSERT_SSC_OR; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract + * the code point inversion list and just the relevant flags */ + if (is_ANYOF_SYNTHETIC(or_with)) { + ored_cp_list = ((regnode_ssc*) or_with)->invlist; + ored_flags = or_with_flags; + } + else { + ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); + ored_flags = or_with_flags & ANYOF_COMMON_FLAGS; + if (OP(or_with) != ANYOFD) { + ored_flags |= + or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared + |ANYOF_HAS_EXTRA_RUNTIME_MATCHES); + if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) { + ored_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES; + } + } + } + + ANYOF_FLAGS(ssc) |= ored_flags; + + /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. + * C2 is the list of code points in 'or-with'; P2, its posix classes. + * 'or_with' may be inverted. When not inverted, we have the simple + * situation of computing: + * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) + * If P1|P2 yields a situation with both a class and its complement are + * set, like having both \w and \W, this matches all code points, and we + * can delete these from the P component of the ssc going forward. XXX We + * might be able to delete all the P components, but I (khw) am not certain + * about this, and it is better to be safe. + * + * Inverted, we have + * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) + * <= (C1 | P1) | ~C2 + * <= (C1 | ~C2) | P1 + * (which results in actually simpler code than the non-inverted case) + * */ + + if ((or_with_flags & ANYOF_INVERT) + && ! is_ANYOF_SYNTHETIC(or_with)) + { + /* We ignore P2, leaving P1 going forward */ + } /* else Not inverted */ + else if (or_with_flags & ANYOF_MATCHES_POSIXL) { + ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + unsigned int i; + for (i = 0; i < ANYOF_MAX; i += 2) { + if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) + { + ssc_match_all_cp(ssc); + ANYOF_POSIXL_CLEAR(ssc, i); + ANYOF_POSIXL_CLEAR(ssc, i+1); + } + } + } + } + + ssc_union(ssc, + ored_cp_list, + FALSE /* Already has been inverted */ + ); +} + +STATIC void +S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_UNION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_union_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +STATIC void +S_ssc_intersection(pTHX_ regnode_ssc *ssc, + SV* const invlist, + const bool invert2nd) +{ + PERL_ARGS_ASSERT_SSC_INTERSECTION; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + _invlist_intersection_maybe_complement_2nd(ssc->invlist, + invlist, + invert2nd, + &ssc->invlist); +} + +STATIC void +S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) +{ + PERL_ARGS_ASSERT_SSC_ADD_RANGE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); +} + +STATIC void +S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) +{ + /* AND just the single code point 'cp' into the SSC 'ssc' */ + + SV* cp_list = _new_invlist(2); + + PERL_ARGS_ASSERT_SSC_CP_AND; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + cp_list = add_cp_to_invlist(cp_list, cp); + ssc_intersection(ssc, cp_list, + FALSE /* Not inverted */ + ); + SvREFCNT_dec_NN(cp_list); +} + +STATIC void +S_ssc_clear_locale(regnode_ssc *ssc) +{ + /* Set the SSC 'ssc' to not match any locale things */ + PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + ANYOF_POSIXL_ZERO(ssc); + ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; +} + +bool +Perl_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) +{ + /* The synthetic start class is used to hopefully quickly winnow down + * places where a pattern could start a match in the target string. If it + * doesn't really narrow things down that much, there isn't much point to + * having the overhead of using it. This function uses some very crude + * heuristics to decide if to use the ssc or not. + * + * It returns TRUE if 'ssc' rules out more than half what it considers to + * be the "likely" possible matches, but of course it doesn't know what the + * actual things being matched are going to be; these are only guesses + * + * For /l matches, it assumes that the only likely matches are going to be + * in the 0-255 range, uniformly distributed, so half of that is 127 + * For /a and /d matches, it assumes that the likely matches will be just + * the ASCII range, so half of that is 63 + * For /u and there isn't anything matching above the Latin1 range, it + * assumes that that is the only range likely to be matched, and uses + * half that as the cut-off: 127. If anything matches above Latin1, + * it assumes that all of Unicode could match (uniformly), except for + * non-Unicode code points and things in the General Category "Other" + * (unassigned, private use, surrogates, controls and formats). This + * is a much large number. */ + + U32 count = 0; /* Running total of number of code points matched by + 'ssc' */ + UV start, end; /* Start and end points of current range in inversion + XXX outdated. UTF-8 locales are common, what about invert? list */ + const U32 max_code_points = (LOC) + ? 256 + : (( ! UNI_SEMANTICS + || invlist_highest(ssc->invlist) < 256) + ? 128 + : NON_OTHER_COUNT); + const U32 max_match = max_code_points / 2; + + PERL_ARGS_ASSERT_IS_SSC_WORTH_IT; + + invlist_iterinit(ssc->invlist); + while (invlist_iternext(ssc->invlist, &start, &end)) { + if (start >= max_code_points) { + break; + } + end = MIN(end, max_code_points - 1); + count += end - start + 1; + if (count >= max_match) { + invlist_iterfinish(ssc->invlist); + return FALSE; + } + } + + return TRUE; +} + + +void +Perl_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) +{ + /* The inversion list in the SSC is marked mortal; now we need a more + * permanent copy, which is stored the same way that is done in a regular + * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit + * map */ + + SV* invlist = invlist_clone(ssc->invlist, NULL); + + PERL_ARGS_ASSERT_SSC_FINALIZE; + + assert(is_ANYOF_SYNTHETIC(ssc)); + + /* The code in this file assumes that all but these flags aren't relevant + * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared + * by the time we reach here */ + assert(! (ANYOF_FLAGS(ssc) + & ~( ANYOF_COMMON_FLAGS + |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared + |ANYOF_HAS_EXTRA_RUNTIME_MATCHES))); + + populate_anyof_bitmap_from_invlist( (regnode *) ssc, &invlist); + + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL); + SvREFCNT_dec(invlist); + + /* Make sure is clone-safe */ + ssc->invlist = NULL; + + if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { + ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL; + OP(ssc) = ANYOFPOSIXL; + } + else if (RExC_contains_locale) { + OP(ssc) = ANYOFL; + } + + assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); +} + +/* The below joins as many adjacent EXACTish nodes as possible into a single + * one. The regop may be changed if the node(s) contain certain sequences that + * require special handling. The joining is only done if: + * 1) there is room in the current conglomerated node to entirely contain the + * next one. + * 2) they are compatible node types + * + * The adjacent nodes actually may be separated by NOTHING-kind nodes, and + * these get optimized out + * + * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full + * as possible, even if that means splitting an existing node so that its first + * part is moved to the preceding node. This would maximise the efficiency of + * memEQ during matching. + * + * If a node is to match under /i (folded), the number of characters it matches + * can be different than its character length if it contains a multi-character + * fold. *min_subtract is set to the total delta number of characters of the + * input nodes. + * + * And *unfolded_multi_char is set to indicate whether or not the node contains + * an unfolded multi-char fold. This happens when it won't be known until + * runtime whether the fold is valid or not; namely + * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the + * target string being matched against turns out to be UTF-8 is that fold + * valid; or + * 2) for EXACTFL nodes whose folding rules depend on the locale in force at + * runtime. + * (Multi-char folds whose components are all above the Latin1 range are not + * run-time locale dependent, and have already been folded by the time this + * function is called.) + * + * This is as good a place as any to discuss the design of handling these + * multi-character fold sequences. It's been wrong in Perl for a very long + * time. There are three code points in Unicode whose multi-character folds + * were long ago discovered to mess things up. The previous designs for + * dealing with these involved assigning a special node for them. This + * approach doesn't always work, as evidenced by this example: + * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches + * Both sides fold to "sss", but if the pattern is parsed to create a node that + * would match just the \xDF, it won't be able to handle the case where a + * successful match would have to cross the node's boundary. The new approach + * that hopefully generally solves the problem generates an EXACTFUP node + * that is "sss" in this case. + * + * It turns out that there are problems with all multi-character folds, and not + * just these three. Now the code is general, for all such cases. The + * approach taken is: + * 1) This routine examines each EXACTFish node that could contain multi- + * character folded sequences. Since a single character can fold into + * such a sequence, the minimum match length for this node is less than + * the number of characters in the node. This routine returns in + * *min_subtract how many characters to subtract from the actual + * length of the string to get a real minimum match length; it is 0 if + * there are no multi-char foldeds. This delta is used by the caller to + * adjust the min length of the match, and the delta between min and max, + * so that the optimizer doesn't reject these possibilities based on size + * constraints. + * + * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF) + * under /u, we fold it to 'ss' in regatom(), and in this routine, after + * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8 + * EXACTFU nodes. The node type of such nodes is then changed to + * EXACTFUP, indicating it is problematic, and needs careful handling. + * (The procedures in step 1) above are sufficient to handle this case in + * UTF-8 encoded nodes.) The reason this is problematic is that this is + * the only case where there is a possible fold length change in non-UTF-8 + * patterns. By reserving a special node type for problematic cases, the + * far more common regular EXACTFU nodes can be processed faster. + * regexec.c takes advantage of this. + * + * EXACTFUP has been created as a grab-bag for (hopefully uncommon) + * problematic cases. These all only occur when the pattern is not + * UTF-8. In addition to the 'ss' sequence where there is a possible fold + * length change, it handles the situation where the string cannot be + * entirely folded. The strings in an EXACTFish node are folded as much + * as possible during compilation in regcomp.c. This saves effort in + * regex matching. By using an EXACTFUP node when it is not possible to + * fully fold at compile time, regexec.c can know that everything in an + * EXACTFU node is folded, so folding can be skipped at runtime. The only + * case where folding in EXACTFU nodes can't be done at compile time is + * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This + * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes + * handle two very different cases. Alternatively, there could have been + * a node type where there are length changes, one for unfolded, and one + * for both. If yet another special case needed to be created, the number + * of required node types would have to go to 7. khw figures that even + * though there are plenty of node types to spare, that the maintenance + * cost wasn't worth the small speedup of doing it that way, especially + * since he thinks the MICRO SIGN is rarely encountered in practice. + * + * There are other cases where folding isn't done at compile time, but + * none of them are under /u, and hence not for EXACTFU nodes. The folds + * in EXACTFL nodes aren't known until runtime, and vary as the locale + * changes. Some folds in EXACTF depend on if the runtime target string + * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di + * when no fold in it depends on the UTF-8ness of the target string.) + * + * 3) A problem remains for unfolded multi-char folds. (These occur when the + * validity of the fold won't be known until runtime, and so must remain + * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA + * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot + * be an EXACTF node with a UTF-8 pattern.) They also occur for various + * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) + * The reason this is a problem is that the optimizer part of regexec.c + * (probably unwittingly, in Perl_regexec_flags()) makes an assumption + * that a character in the pattern corresponds to at most a single + * character in the target string. (And I do mean character, and not byte + * here, unlike other parts of the documentation that have never been + * updated to account for multibyte Unicode.) Sharp s in EXACTF and + * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA + * nodes it can match "\x{17F}\x{17F}". These, along with other ones in + * EXACTFL nodes, violate the assumption, and they are the only instances + * where it is violated. I'm reluctant to try to change the assumption, + * as the code involved is impenetrable to me (khw), so instead the code + * here punts. This routine examines EXACTFL nodes, and (when the pattern + * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a + * boolean indicating whether or not the node contains such a fold. When + * it is true, the caller sets a flag that later causes the optimizer in + * this file to not set values for the floating and fixed string lengths, + * and thus avoids the optimizer code in regexec.c that makes the invalid + * assumption. Thus, there is no optimization based on string lengths for + * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern + * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the + * assumption is wrong only in these cases is that all other non-UTF-8 + * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to + * their expanded versions. (Again, we can't prefold sharp s to 'ss' in + * EXACTF nodes because we don't know at compile time if it actually + * matches 'ss' or not. For EXACTF nodes it will match iff the target + * string is in UTF-8. This is in contrast to EXACTFU nodes, where it + * always matches; and EXACTFAA where it never does. In an EXACTFAA node + * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the + * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 + * string would require the pattern to be forced into UTF-8, the overhead + * of which we want to avoid. Similarly the unfolded multi-char folds in + * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 + * locale.) + * + * Similarly, the code that generates tries doesn't currently handle + * not-already-folded multi-char folds, and it looks like a pain to change + * that. Therefore, trie generation of EXACTFAA nodes with the sharp s + * doesn't work. Instead, such an EXACTFAA is turned into a new regnode, + * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people + * using /iaa matching will be doing so almost entirely with ASCII + * strings, so this should rarely be encountered in practice */ + +U32 +Perl_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, + UV *min_subtract, bool *unfolded_multi_char, + U32 flags, regnode *val, U32 depth) +{ + /* Merge several consecutive EXACTish nodes into one. */ + + regnode *n = regnext(scan); + U32 stringok = 1; + regnode *next = REGNODE_AFTER_varies(scan); + U32 merged = 0; + U32 stopnow = 0; +#ifdef DEBUGGING + regnode *stop = scan; + DECLARE_AND_GET_RE_DEBUG_FLAGS; +#else + PERL_UNUSED_ARG(depth); +#endif + + PERL_ARGS_ASSERT_JOIN_EXACT; +#ifndef EXPERIMENTAL_INPLACESCAN + PERL_UNUSED_ARG(flags); + PERL_UNUSED_ARG(val); +#endif + DEBUG_PEEP("join", scan, depth, 0); + + assert(REGNODE_TYPE(OP(scan)) == EXACT); + + /* Look through the subsequent nodes in the chain. Skip NOTHING, merge + * EXACT ones that are mergeable to the current one. */ + while ( n + && ( REGNODE_TYPE(OP(n)) == NOTHING + || (stringok && REGNODE_TYPE(OP(n)) == EXACT)) + && NEXT_OFF(n) + && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) + { + + if (OP(n) == TAIL || n > next) + stringok = 0; + if (REGNODE_TYPE(OP(n)) == NOTHING) { + DEBUG_PEEP("skip:", n, depth, 0); + NEXT_OFF(scan) += NEXT_OFF(n); + next = n + NODE_STEP_REGNODE; +#ifdef DEBUGGING + if (stringok) + stop = n; +#endif + n = regnext(n); + } + else if (stringok) { + const unsigned int oldl = STR_LEN(scan); + regnode * const nnext = regnext(n); + + /* XXX I (khw) kind of doubt that this works on platforms (should + * Perl ever run on one) where U8_MAX is above 255 because of lots + * of other assumptions */ + /* Don't join if the sum can't fit into a single node */ + if (oldl + STR_LEN(n) > U8_MAX) + break; + + /* Joining something that requires UTF-8 with something that + * doesn't, means the result requires UTF-8. */ + if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) { + OP(scan) = EXACT_REQ8; + } + else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) { + ; /* join is compatible, no need to change OP */ + } + else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) { + OP(scan) = EXACTFU_REQ8; + } + else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) { + ; /* join is compatible, no need to change OP */ + } + else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) { + ; /* join is compatible, no need to change OP */ + } + else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) { + + /* Under /di, temporary EXACTFU_S_EDGE nodes are generated, + * which can join with EXACTFU ones. We check for this case + * here. These need to be resolved to either EXACTFU or + * EXACTF at joining time. They have nothing in them that + * would forbid them from being the more desirable EXACTFU + * nodes except that they begin and/or end with a single [Ss]. + * The reason this is problematic is because they could be + * joined in this loop with an adjacent node that ends and/or + * begins with [Ss] which would then form the sequence 'ss', + * which matches differently under /di than /ui, in which case + * EXACTFU can't be used. If the 'ss' sequence doesn't get + * formed, the nodes get absorbed into any adjacent EXACTFU + * node. And if the only adjacent node is EXACTF, they get + * absorbed into that, under the theory that a longer node is + * better than two shorter ones, even if one is EXACTFU. Note + * that EXACTFU_REQ8 is generated only for UTF-8 patterns, + * and the EXACTFU_S_EDGE ones only for non-UTF-8. */ + + if (STRING(n)[STR_LEN(n)-1] == 's') { + + /* Here the joined node would end with 's'. If the node + * following the combination is an EXACTF one, it's better to + * join this trailing edge 's' node with that one, leaving the + * current one in 'scan' be the more desirable EXACTFU */ + if (OP(nnext) == EXACTF) { + break; + } + + OP(scan) = EXACTFU_S_EDGE; + + } /* Otherwise, the beginning 's' of the 2nd node just + becomes an interior 's' in 'scan' */ + } + else if (OP(scan) == EXACTF && OP(n) == EXACTF) { + ; /* join is compatible, no need to change OP */ + } + else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) { + + /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE + * nodes. But the latter nodes can be also joined with EXACTFU + * ones, and that is a better outcome, so if the node following + * 'n' is EXACTFU, quit now so that those two can be joined + * later */ + if (OP(nnext) == EXACTFU) { + break; + } + + /* The join is compatible, and the combined node will be + * EXACTF. (These don't care if they begin or end with 's' */ + } + else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) { + if ( STRING(scan)[STR_LEN(scan)-1] == 's' + && STRING(n)[0] == 's') + { + /* When combined, we have the sequence 'ss', which means we + * have to remain /di */ + OP(scan) = EXACTF; + } + } + else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) { + if (STRING(n)[0] == 's') { + ; /* Here the join is compatible and the combined node + starts with 's', no need to change OP */ + } + else { /* Now the trailing 's' is in the interior */ + OP(scan) = EXACTFU; + } + } + else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) { + + /* The join is compatible, and the combined node will be + * EXACTF. (These don't care if they begin or end with 's' */ + OP(scan) = EXACTF; + } + else if (OP(scan) != OP(n)) { + + /* The only other compatible joinings are the same node type */ + break; + } + + DEBUG_PEEP("merg", n, depth, 0); + merged++; + + next = REGNODE_AFTER_varies(n); + NEXT_OFF(scan) += NEXT_OFF(n); + assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 ); + setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n))); + /* Now we can overwrite *n : */ + Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); +#ifdef DEBUGGING + stop = next - 1; +#endif + n = nnext; + if (stopnow) break; + } + +#ifdef EXPERIMENTAL_INPLACESCAN + if (flags && !NEXT_OFF(n)) { + DEBUG_PEEP("atch", val, depth, 0); + if (REGNODE_OFF_BY_ARG(OP(n))) { + ARG_SET(n, val - n); + } + else { + NEXT_OFF(n) = val - n; + } + stopnow = 1; + } +#endif + } + + /* This temporary node can now be turned into EXACTFU, and must, as + * regexec.c doesn't handle it */ + if (OP(scan) == EXACTFU_S_EDGE) { + OP(scan) = EXACTFU; + } + + *min_subtract = 0; + *unfolded_multi_char = FALSE; + + /* Here, all the adjacent mergeable EXACTish nodes have been merged. We + * can now analyze for sequences of problematic code points. (Prior to + * this final joining, sequences could have been split over boundaries, and + * hence missed). The sequences only happen in folding, hence for any + * non-EXACT EXACTish node */ + if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) { + U8* s0 = (U8*) STRING(scan); + U8* s = s0; + U8* s_end = s0 + STR_LEN(scan); + + int total_count_delta = 0; /* Total delta number of characters that + multi-char folds expand to */ + + /* One pass is made over the node's string looking for all the + * possibilities. To avoid some tests in the loop, there are two main + * cases, for UTF-8 patterns (which can't have EXACTF nodes) and + * non-UTF-8 */ + if (UTF) { + U8* folded = NULL; + + if (OP(scan) == EXACTFL) { + U8 *d; + + /* An EXACTFL node would already have been changed to another + * node type unless there is at least one character in it that + * is problematic; likely a character whose fold definition + * won't be known until runtime, and so has yet to be folded. + * For all but the UTF-8 locale, folds are 1-1 in length, but + * to handle the UTF-8 case, we need to create a temporary + * folded copy using UTF-8 locale rules in order to analyze it. + * This is because our macros that look to see if a sequence is + * a multi-char fold assume everything is folded (otherwise the + * tests in those macros would be too complicated and slow). + * Note that here, the non-problematic folds will have already + * been done, so we can just copy such characters. We actually + * don't completely fold the EXACTFL string. We skip the + * unfolded multi-char folds, as that would just create work + * below to figure out the size they already are */ + + Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); + d = folded; + while (s < s_end) { + STRLEN s_len = UTF8SKIP(s); + if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { + Copy(s, d, s_len, U8); + d += s_len; + } + else if (is_FOLDS_TO_MULTI_utf8(s)) { + *unfolded_multi_char = TRUE; + Copy(s, d, s_len, U8); + d += s_len; + } + else if (isASCII(*s)) { + *(d++) = toFOLD(*s); + } + else { + STRLEN len; + _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL); + d += len; + } + s += s_len; + } + + /* Point the remainder of the routine to look at our temporary + * folded copy */ + s = folded; + s_end = d; + } /* End of creating folded copy of EXACTFL string */ + + /* Examine the string for a multi-character fold sequence. UTF-8 + * patterns have all characters pre-folded by the time this code is + * executed */ + while (s < s_end - 1) /* Can stop 1 before the end, as minimum + length sequence we are looking for is 2 */ + { + int count = 0; /* How many characters in a multi-char fold */ + int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); + if (! len) { /* Not a multi-char fold: get next char */ + s += UTF8SKIP(s); + continue; + } + + { /* Here is a generic multi-char fold. */ + U8* multi_end = s + len; + + /* Count how many characters are in it. In the case of + * /aa, no folds which contain ASCII code points are + * allowed, so check for those, and skip if found. */ + if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) { + count = utf8_length(s, multi_end); + s = multi_end; + } + else { + while (s < multi_end) { + if (isASCII(*s)) { + s++; + goto next_iteration; + } + else { + s += UTF8SKIP(s); + } + count++; + } + } + } + + /* The delta is how long the sequence is minus 1 (1 is how long + * the character that folds to the sequence is) */ + total_count_delta += count - 1; + next_iteration: ; + } + + /* We created a temporary folded copy of the string in EXACTFL + * nodes. Therefore we need to be sure it doesn't go below zero, + * as the real string could be shorter */ + if (OP(scan) == EXACTFL) { + int total_chars = utf8_length((U8*) STRING(scan), + (U8*) STRING(scan) + STR_LEN(scan)); + if (total_count_delta > total_chars) { + total_count_delta = total_chars; + } + } + + *min_subtract += total_count_delta; + Safefree(folded); + } + else if (OP(scan) == EXACTFAA) { + + /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char + * fold to the ASCII range (and there are no existing ones in the + * upper latin1 range). But, as outlined in the comments preceding + * this function, we need to flag any occurrences of the sharp s. + * This character forbids trie formation (because of added + * complexity) */ +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) + while (s < s_end) { + if (*s == LATIN_SMALL_LETTER_SHARP_S) { + OP(scan) = EXACTFAA_NO_TRIE; + *unfolded_multi_char = TRUE; + break; + } + s++; + } + } + else if (OP(scan) != EXACTFAA_NO_TRIE) { + + /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char + * folds that are all Latin1. As explained in the comments + * preceding this function, we look also for the sharp s in EXACTF + * and EXACTFL nodes; it can be in the final position. Otherwise + * we can stop looking 1 byte earlier because have to find at least + * two characters for a multi-fold */ + const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) + ? s_end + : s_end -1; + + while (s < upper) { + int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); + if (! len) { /* Not a multi-char fold. */ + if (*s == LATIN_SMALL_LETTER_SHARP_S + && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) + { + *unfolded_multi_char = TRUE; + } + s++; + continue; + } + + if (len == 2 + && isALPHA_FOLD_EQ(*s, 's') + && isALPHA_FOLD_EQ(*(s+1), 's')) + { + + /* EXACTF nodes need to know that the minimum length + * changed so that a sharp s in the string can match this + * ss in the pattern, but they remain EXACTF nodes, as they + * won't match this unless the target string is in UTF-8, + * which we don't know until runtime. EXACTFL nodes can't + * transform into EXACTFU nodes */ + if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { + OP(scan) = EXACTFUP; + } + } + + *min_subtract += len - 1; + s += len; + } +#endif + } + } + +#ifdef DEBUGGING + /* Allow dumping but overwriting the collection of skipped + * ops and/or strings with fake optimized ops */ + n = REGNODE_AFTER_varies(scan); + while (n <= stop) { + OP(n) = OPTIMIZED; + FLAGS(n) = 0; + NEXT_OFF(n) = 0; + n++; + } +#endif + DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);}); + return stopnow; +} + +/* REx optimizer. Converts nodes into quicker variants "in place". + Finds fixed substrings. */ + + +/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set + to the position after last scanned or to NULL. */ + +/* the return from this sub is the minimum length that could possibly match */ +SSize_t +Perl_study_chunk(pTHX_ + RExC_state_t *pRExC_state, + regnode **scanp, /* Start here (read-write). */ + SSize_t *minlenp, /* used for the minlen of substrings? */ + SSize_t *deltap, /* Write maxlen-minlen here. */ + regnode *last, /* Stop before this one. */ + scan_data_t *data, /* string data about the pattern */ + I32 stopparen, /* treat CLOSE-N as END, see GOSUB */ + U32 recursed_depth, /* how deep have we recursed via GOSUB */ + regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */ + U32 flags, /* flags controlling this call, see SCF_ flags */ + U32 depth, /* how deep have we recursed period */ + bool was_mutate_ok /* TRUE if in-place optimizations are allowed. + FALSE only if the caller (recursively) was + prohibited from modifying the regops, because + a higher caller is holding a ptr to them. */ +) +{ + /* vars about the regnodes we are working with */ + regnode *scan = *scanp; /* the current opcode we are inspecting */ + regnode *next = NULL; /* the next opcode beyond scan, tmp var */ + regnode *first_non_open = scan; /* FIXME: should this init to NULL? + the first non open regop, if the init + val IS an OPEN then we will skip past + it just after the var decls section */ + I32 code = 0; /* temp var used to hold the optype of a regop */ + + /* vars about the min and max length of the pattern */ + SSize_t min = 0; /* min length of this part of the pattern */ + SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT + this is adjusted down if we find + an ACCEPT */ + SSize_t delta = 0; /* difference between min and max length + (not accounting for stopmin) */ + + /* vars about capture buffers in the pattern */ + I32 pars = 0; /* count of OPEN opcodes */ + I32 is_par = OP(scan) == OPEN ? PARNO(scan) : 0; /* is this op an OPEN? */ + + /* vars about whether this pattern contains something that can match + * infinitely long strings, eg, X* or X+ */ + int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); + int is_inf_internal = 0; /* The studied chunk is infinite */ + + /* scan_data_t (struct) is used to hold information about the substrings + * and start class we have extracted from the string */ + scan_data_t data_fake; /* temp var used for recursing in some cases */ + + SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do + trie optimizations */ + + scan_frame *frame = NULL; /* used as part of fake recursion */ + + DECLARE_AND_GET_RE_DEBUG_FLAGS; + + PERL_ARGS_ASSERT_STUDY_CHUNK; + RExC_study_started= 1; + + Zero(&data_fake, 1, scan_data_t); + + if ( depth == 0 ) { + while (first_non_open && OP(first_non_open) == OPEN) + first_non_open=regnext(first_non_open); + } + + fake_study_recurse: + DEBUG_r( + RExC_study_chunk_recursed_count++; + ); + DEBUG_OPTIMISE_MORE_r( + { + Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p", + depth, (long)stopparen, + (unsigned long)RExC_study_chunk_recursed_count, + (unsigned long)depth, (unsigned long)recursed_depth, + scan, + last); + if (recursed_depth) { + U32 i; + U32 j; + for ( j = 0 ; j < recursed_depth ; j++ ) { + for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) { + if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) { + Perl_re_printf( aTHX_ " %d",(int)i); + break; + } + } + if ( j + 1 < recursed_depth ) { + Perl_re_printf( aTHX_ ","); + } + } + } + Perl_re_printf( aTHX_ "\n"); + } + ); + while ( scan && OP(scan) != END && scan < last ){ + UV min_subtract = 0; /* How mmany chars to subtract from the minimum + node length to get a real minimum (because + the folded version may be shorter) */ + bool unfolded_multi_char = FALSE; + /* avoid mutating ops if we are anywhere within the recursed or + * enframed handling for a GOSUB: the outermost level will handle it. + */ + bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub); + /* Peephole optimizer: */ + DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta); + DEBUG_PEEP("Peep", scan, depth, flags); + + + /* The reason we do this here is that we need to deal with things like + * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT + * parsing code, as each (?:..) is handled by a different invocation of + * reg() -- Yves + */ + if (REGNODE_TYPE(OP(scan)) == EXACT + && OP(scan) != LEXACT + && OP(scan) != LEXACT_REQ8 + && mutate_ok + ) { + join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char, + 0, NULL, depth + 1); + } + + /* Follow the next-chain of the current node and optimize + away all the NOTHINGs from it. + */ + rck_elide_nothing(scan); + + /* The principal pseudo-switch. Cannot be a switch, since we look into + * several different things. */ + if ( OP(scan) == DEFINEP ) { + SSize_t minlen = 0; + SSize_t deltanext = 0; + SSize_t fake_last_close = 0; + regnode *fake_last_close_op = NULL; + U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY); + + StructCopy(&zero_scan_data, &data_fake, scan_data_t); + scan = regnext(scan); + assert( OP(scan) == IFTHEN ); + DEBUG_PEEP("expect IFTHEN", scan, depth, flags); + + data_fake.last_closep= &fake_last_close; + data_fake.last_close_opp= &fake_last_close_op; + minlen = *minlenp; + next = regnext(scan); + scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN); + DEBUG_PEEP("scan", scan, depth, flags); + DEBUG_PEEP("next", next, depth, flags); + + /* we suppose the run is continuous, last=next... + * NOTE we dont use the return here! */ + /* DEFINEP study_chunk() recursion */ + (void)study_chunk(pRExC_state, &scan, &minlen, + &deltanext, next, &data_fake, stopparen, + recursed_depth, NULL, f, depth+1, mutate_ok); + + scan = next; + } else + if ( + OP(scan) == BRANCH || + OP(scan) == BRANCHJ || + OP(scan) == IFTHEN + ) { + next = regnext(scan); + code = OP(scan); + + /* The op(next)==code check below is to see if we + * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN" + * IFTHEN is special as it might not appear in pairs. + * Not sure whether BRANCH-BRANCHJ is possible, regardless + * we dont handle it cleanly. */ + if (OP(next) == code || code == IFTHEN) { + /* NOTE - There is similar code to this block below for + * handling TRIE nodes on a re-study. If you change stuff here + * check there too. */ + SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0; + regnode_ssc accum; + regnode * const startbranch=scan; + + if (flags & SCF_DO_SUBSTR) { + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + + if (flags & SCF_DO_STCLASS) + ssc_init_zero(pRExC_state, &accum); + + while (OP(scan) == code) { + SSize_t deltanext, minnext, fake_last_close = 0; + regnode *fake_last_close_op = NULL; + U32 f = (flags & SCF_TRIE_DOING_RESTUDY); + regnode_ssc this_class; + + DEBUG_PEEP("Branch", scan, depth, flags); + + num++; + StructCopy(&zero_scan_data, &data_fake, scan_data_t); + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + data_fake.last_close_opp = data->last_close_opp; + } + else { + data_fake.last_closep = &fake_last_close; + data_fake.last_close_opp = &fake_last_close_op; + } + + data_fake.pos_delta = delta; + next = regnext(scan); + + scan = REGNODE_AFTER_opcode(scan, code); + + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + data_fake.start_class = &this_class; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + + /* we suppose the run is continuous, last=next...*/ + /* recurse study_chunk() for each BRANCH in an alternation */ + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, next, &data_fake, stopparen, + recursed_depth, NULL, f, depth+1, + mutate_ok); + + if (min1 > minnext) + min1 = minnext; + if (deltanext == OPTIMIZE_INFTY) { + is_inf = is_inf_internal = 1; + max1 = OPTIMIZE_INFTY; + } else if (max1 < minnext + deltanext) + max1 = minnext + deltanext; + scan = next; + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SCF_SEEN_ACCEPT) { + if ( stopmin > minnext) + stopmin = min + min1; + flags &= ~SCF_DO_SUBSTR; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + } + if (data) { + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (flags & SCF_DO_STCLASS) + ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); + DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta); + } + if (code == IFTHEN && num < 2) /* Empty ELSE branch */ + min1 = 0; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += min1; + if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1)) + data->pos_delta = OPTIMIZE_INFTY; + else + data->pos_delta += max1 - min1; + if (max1 != min1 || is_inf) + data->cur_is_floating = 1; + } + min += min1; + if (delta == OPTIMIZE_INFTY + || OPTIMIZE_INFTY - delta - (max1 - min1) < 0) + delta = OPTIMIZE_INFTY; + else + delta += max1 - min1; + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (flags & SCF_DO_STCLASS_AND) { + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); + flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + } + } + DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta); + + if (PERL_ENABLE_TRIE_OPTIMISATION + && OP(startbranch) == BRANCH + && mutate_ok + ) { + /* demq. + + Assuming this was/is a branch we are dealing with: 'scan' + now points at the item that follows the branch sequence, + whatever it is. We now start at the beginning of the + sequence and look for subsequences of + + BRANCH->EXACT=>x1 + BRANCH->EXACT=>x2 + tail + + which would be constructed from a pattern like + /A|LIST|OF|WORDS/ + + If we can find such a subsequence we need to turn the first + element into a trie and then add the subsequent branch exact + strings to the trie. + + We have two cases + + 1. patterns where the whole set of branches can be + converted. + + 2. patterns where only a subset can be converted. + + In case 1 we can replace the whole set with a single regop + for the trie. In case 2 we need to keep the start and end + branches so + + 'BRANCH EXACT; BRANCH EXACT; BRANCH X' + becomes BRANCH TRIE; BRANCH X; + + There is an additional case, that being where there is a + common prefix, which gets split out into an EXACT like node + preceding the TRIE node. + + If X(1..n)==tail then we can do a simple trie, if not we make + a "jump" trie, such that when we match the appropriate word + we "jump" to the appropriate tail node. Essentially we turn + a nested if into a case structure of sorts. + + */ + + int made=0; + if (!re_trie_maxbuff) { + re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + if (!SvIOK(re_trie_maxbuff)) + sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); + } + if ( SvIV(re_trie_maxbuff)>=0 ) { + regnode *cur; + regnode *first = (regnode *)NULL; + regnode *prev = (regnode *)NULL; + regnode *tail = scan; + U8 trietype = 0; + U32 count=0; + + /* var tail is used because there may be a TAIL + regop in the way. Ie, the exacts will point to the + thing following the TAIL, but the last branch will + point at the TAIL. So we advance tail. If we + have nested (?:) we may have to move through several + tails. + */ + + while ( OP( tail ) == TAIL ) { + /* this is the TAIL generated by (?:) */ + tail = regnext( tail ); + } + + + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state); + Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n", + depth+1, + "Looking for TRIE'able sequences. Tail node is ", + (UV) REGNODE_OFFSET(tail), + SvPV_nolen_const( RExC_mysv ) + ); + }); + + /* + + Step through the branches + cur represents each branch, + noper is the first thing to be matched as part + of that branch + noper_next is the regnext() of that node. + + We normally handle a case like this + /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also + support building with NOJUMPTRIE, which restricts + the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is + a possible optimization target. If we are building + under NOJUMPTRIE then we require that noper_next is + the same as scan (our current position in the regex + program). + + Once we have two or more consecutive such branches + we can create a trie of the EXACT's contents and + stitch it in place into the program. + + If the sequence represents all of the branches in + the alternation we replace the entire thing with a + single TRIE node. + + Otherwise when it is a subsequence we need to + stitch it in place and replace only the relevant + branches. This means the first branch has to remain + as it is used by the alternation logic, and its + next pointer, and needs to be repointed at the item + on the branch chain following the last branch we + have optimized away. + + This could be either a BRANCH, in which case the + subsequence is internal, or it could be the item + following the branch sequence in which case the + subsequence is at the end (which does not + necessarily mean the first node is the start of the + alternation). + + TRIE_TYPE(X) is a define which maps the optype to a + trietype. + + optype | trietype + ----------------+----------- + NOTHING | NOTHING + EXACT | EXACT + EXACT_REQ8 | EXACT + EXACTFU | EXACTFU + EXACTFU_REQ8 | EXACTFU + EXACTFUP | EXACTFU + EXACTFAA | EXACTFAA + EXACTL | EXACTL + EXACTFLU8 | EXACTFLU8 + + + */ +#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \ + ? NOTHING \ + : ( EXACT == (X) || EXACT_REQ8 == (X) ) \ + ? EXACT \ + : ( EXACTFU == (X) \ + || EXACTFU_REQ8 == (X) \ + || EXACTFUP == (X) ) \ + ? EXACTFU \ + : ( EXACTFAA == (X) ) \ + ? EXACTFAA \ + : ( EXACTL == (X) ) \ + ? EXACTL \ + : ( EXACTFLU8 == (X) ) \ + ? EXACTFLU8 \ + : 0 ) + + /* dont use tail as the end marker for this traverse */ + for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { + regnode * const noper = REGNODE_AFTER( cur ); + U8 noper_type = OP( noper ); + U8 noper_trietype = TRIE_TYPE( noper_type ); +#if defined(DEBUGGING) || defined(NOJUMPTRIE) + regnode * const noper_next = regnext( noper ); + U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0; +#endif + + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); + Perl_re_indentf( aTHX_ "- %d:%s (%d)", + depth+1, + REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) ); + + regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state); + Perl_re_printf( aTHX_ " -> %d:%s", + REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv)); + + if ( noper_next ) { + regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state); + Perl_re_printf( aTHX_ "\t=> %d:%s\t", + REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv)); + } + Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n", + REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur), + REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype) + ); + }); + + /* Is noper a trieable nodetype that can be merged + * with the current trie (if there is one)? */ + if ( noper_trietype + && + ( + ( noper_trietype == NOTHING ) + || ( trietype == NOTHING ) + || ( trietype == noper_trietype ) + ) +#ifdef NOJUMPTRIE + && noper_next >= tail +#endif + && count < U16_MAX) + { + /* Handle mergable triable node Either we are + * the first node in a new trieable sequence, + * in which case we do some bookkeeping, + * otherwise we update the end pointer. */ + if ( !first ) { + first = cur; + if ( noper_trietype == NOTHING ) { +#if !defined(DEBUGGING) && !defined(NOJUMPTRIE) + regnode * const noper_next = regnext( noper ); + U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; +#endif + + if ( noper_next_trietype ) { + trietype = noper_next_trietype; + } else if (noper_next_type) { + /* a NOTHING regop is 1 regop wide. + * We need at least two for a trie + * so we can't merge this in */ + first = NULL; + } + } else { + trietype = noper_trietype; + } + } else { + if ( trietype == NOTHING ) + trietype = noper_trietype; + prev = cur; + } + if (first) + count++; + } /* end handle mergable triable node */ + else { + /* handle unmergable node - + * noper may either be a triable node which can + * not be tried together with the current trie, + * or a non triable node */ + if ( prev ) { + /* If last is set and trietype is not + * NOTHING then we have found at least two + * triable branch sequences in a row of a + * similar trietype so we can turn them + * into a trie. If/when we allow NOTHING to + * start a trie sequence this condition + * will be required, and it isn't expensive + * so we leave it in for now. */ + if ( trietype && trietype != NOTHING ) + make_trie( pRExC_state, + startbranch, first, cur, tail, + count, trietype, depth+1 ); + prev = NULL; /* note: we clear/update + first, trietype etc below, + so we dont do it here */ + } + if ( noper_trietype +#ifdef NOJUMPTRIE + && noper_next >= tail +#endif + ){ + /* noper is triable, so we can start a new + * trie sequence */ + count = 1; + first = cur; + trietype = noper_trietype; + } else if (first) { + /* if we already saw a first but the + * current node is not triable then we have + * to reset the first information. */ + count = 0; + first = NULL; + trietype = 0; + } + } /* end handle unmergable node */ + } /* loop over branches */ + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); + Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ", + depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur)); + Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n", + REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur), + REGNODE_NAME(trietype) + ); + + }); + if ( prev && trietype ) { + if ( trietype != NOTHING ) { + /* the last branch of the sequence was part of + * a trie, so we have to construct it here + * outside of the loop */ + made= make_trie( pRExC_state, startbranch, + first, scan, tail, count, + trietype, depth+1 ); +#ifdef TRIE_STUDY_OPT + if ( ((made == MADE_EXACT_TRIE && + startbranch == first) + || ( first_non_open == first )) && + depth==0 ) { + flags |= SCF_TRIE_RESTUDY; + if ( startbranch == first + && scan >= tail ) + { + RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; + } + } +#endif + } else { + /* at this point we know whatever we have is a + * NOTHING sequence/branch AND if 'startbranch' + * is 'first' then we can turn the whole thing + * into a NOTHING + */ + if ( startbranch == first ) { + regnode *opt; + /* the entire thing is a NOTHING sequence, + * something like this: (?:|) So we can + * turn it into a plain NOTHING op. */ + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); + Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n", + depth+1, + SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur)); + + }); + OP(startbranch)= NOTHING; + NEXT_OFF(startbranch)= tail - startbranch; + for ( opt= startbranch + 1; opt < tail ; opt++ ) + OP(opt)= OPTIMIZED; + } + } + } /* end if ( prev) */ + } /* TRIE_MAXBUF is non zero */ + } /* do trie */ + DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta); + } + else + scan = REGNODE_AFTER_opcode(scan,code); + continue; + } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) { + I32 paren = 0; + regnode *start = NULL; + regnode *end = NULL; + U32 my_recursed_depth= recursed_depth; + + if (OP(scan) != SUSPEND) { /* GOSUB */ + /* Do setup, note this code has side effects beyond + * the rest of this block. Specifically setting + * RExC_recurse[] must happen at least once during + * study_chunk(). */ + paren = ARG(scan); + RExC_recurse[ARG2L(scan)] = scan; + start = REGNODE_p(RExC_open_parens[paren]); + end = REGNODE_p(RExC_close_parens[paren]); + + /* NOTE we MUST always execute the above code, even + * if we do nothing with a GOSUB */ + if ( + ( flags & SCF_IN_DEFINE ) + || + ( + (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF)) + && + ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 ) + ) + ) { + /* no need to do anything here if we are in a define. */ + /* or we are after some kind of infinite construct + * so we can skip recursing into this item. + * Since it is infinite we will not change the maxlen + * or delta, and if we miss something that might raise + * the minlen it will merely pessimise a little. + * + * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/ + * might result in a minlen of 1 and not of 4, + * but this doesn't make us mismatch, just try a bit + * harder than we should. + * + * However we must assume this GOSUB is infinite, to + * avoid wrongly applying other optimizations in the + * enclosing scope - see GH 18096, for example. + */ + is_inf = is_inf_internal = 1; + scan= regnext(scan); + continue; + } + + if ( + !recursed_depth + || !PAREN_TEST(recursed_depth - 1, paren) + ) { + /* it is quite possible that there are more efficient ways + * to do this. We maintain a bitmap per level of recursion + * of which patterns we have entered so we can detect if a + * pattern creates a possible infinite loop. When we + * recurse down a level we copy the previous levels bitmap + * down. When we are at recursion level 0 we zero the top + * level bitmap. It would be nice to implement a different + * more efficient way of doing this. In particular the top + * level bitmap may be unnecessary. + */ + if (!recursed_depth) { + Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); + } else { + Copy(PAREN_OFFSET(recursed_depth - 1), + PAREN_OFFSET(recursed_depth), + RExC_study_chunk_recursed_bytes, U8); + } + /* we havent recursed into this paren yet, so recurse into it */ + DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta); + PAREN_SET(recursed_depth, paren); + my_recursed_depth= recursed_depth + 1; + } else { + DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta); + /* some form of infinite recursion, assume infinite length + * */ + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->cur_is_floating = 1; + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_anything(data->start_class); + flags &= ~SCF_DO_STCLASS; + + start= NULL; /* reset start so we dont recurse later on. */ + } + } else { + paren = stopparen; + start = scan + 2; + end = regnext(scan); + } + if (start) { + scan_frame *newframe; + assert(end); + if (!RExC_frame_last) { + Newxz(newframe, 1, scan_frame); + SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe); + RExC_frame_head= newframe; + RExC_frame_count++; + } else if (!RExC_frame_last->next_frame) { + Newxz(newframe, 1, scan_frame); + RExC_frame_last->next_frame= newframe; + newframe->prev_frame= RExC_frame_last; + RExC_frame_count++; + } else { + newframe= RExC_frame_last->next_frame; + } + RExC_frame_last= newframe; + + newframe->next_regnode = regnext(scan); + newframe->last_regnode = last; + newframe->stopparen = stopparen; + newframe->prev_recursed_depth = recursed_depth; + newframe->this_prev_frame= frame; + newframe->in_gosub = ( + (frame && frame->in_gosub) || OP(scan) == GOSUB + ); + + DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta); + DEBUG_PEEP("fnew", scan, depth, flags); + + frame = newframe; + scan = start; + stopparen = paren; + last = end; + depth = depth + 1; + recursed_depth= my_recursed_depth; + + continue; + } + } + else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) { + SSize_t bytelen = STR_LEN(scan), charlen; + UV uc; + assert(bytelen); + if (UTF) { + const U8 * const s = (U8*)STRING(scan); + uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); + charlen = utf8_length(s, s + bytelen); + } else { + uc = *((U8*)STRING(scan)); + charlen = bytelen; + } + min += charlen; + if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ + /* The code below prefers earlier match for fixed + offset, later match for variable offset. */ + if (data->last_end == -1) { /* Update the start info. */ + data->last_start_min = data->pos_min; + data->last_start_max = + is_inf ? OPTIMIZE_INFTY + : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min) + ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta; + } + sv_catpvn(data->last_found, STRING(scan), bytelen); + if (UTF) + SvUTF8_on(data->last_found); + { + SV * const sv = data->last_found; + MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len >= 0) + mg->mg_len += charlen; + } + data->last_end = data->pos_min + charlen; + data->pos_min += charlen; /* As in the first entry. */ + data->flags &= ~SF_BEFORE_EOL; + } + + /* ANDing the code point leaves at most it, and not in locale, and + * can't match null string */ + if (flags & SCF_DO_STCLASS_AND) { + ssc_cp_and(data->start_class, uc); + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; + ssc_clear_locale(data->start_class); + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_add_cp(data->start_class, uc); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta); + } + else if (REGNODE_TYPE(OP(scan)) == EXACT) { + /* But OP != EXACT!, so is EXACTFish */ + SSize_t bytelen = STR_LEN(scan), charlen; + const U8 * s = (U8*)STRING(scan); + + /* Replace a length 1 ASCII fold pair node with an ANYOFM node, + * with the mask set to the complement of the bit that differs + * between upper and lower case, and the lowest code point of the + * pair (which the '&' forces) */ + if ( bytelen == 1 + && isALPHA_A(*s) + && ( OP(scan) == EXACTFAA + || ( OP(scan) == EXACTFU + && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s))) + && mutate_ok + ) { + U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */ + + OP(scan) = ANYOFM; + ARG_SET(scan, *s & mask); + FLAGS(scan) = mask; + /* We're not EXACTFish any more, so restudy. + * Search for "restudy" in this file to find + * a comment with details. */ + continue; + } + + /* Search for fixed substrings supports EXACT only. */ + if (flags & SCF_DO_SUBSTR) { + assert(data); + scan_commit(pRExC_state, data, minlenp, is_inf); + } + charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen; + if (unfolded_multi_char) { + RExC_seen |= REG_UNFOLDED_MULTI_SEEN; + } + min += charlen - min_subtract; + assert (min >= 0); + if ((SSize_t)min_subtract < OPTIMIZE_INFTY + && delta < OPTIMIZE_INFTY - (SSize_t)min_subtract + ) { + delta += min_subtract; + } else { + delta = OPTIMIZE_INFTY; + } + if (flags & SCF_DO_SUBSTR) { + data->pos_min += charlen - min_subtract; + if (data->pos_min < 0) { + data->pos_min = 0; + } + if ((SSize_t)min_subtract < OPTIMIZE_INFTY + && data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract + ) { + data->pos_delta += min_subtract; + } else { + data->pos_delta = OPTIMIZE_INFTY; + } + if (min_subtract) { + data->cur_is_floating = 1; /* float */ + } + } + + if (flags & SCF_DO_STCLASS) { + SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan); + + assert(EXACTF_invlist); + if (flags & SCF_DO_STCLASS_AND) { + if (OP(scan) != EXACTFL) + ssc_clear_locale(data->start_class); + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; + ANYOF_POSIXL_ZERO(data->start_class); + ssc_intersection(data->start_class, EXACTF_invlist, FALSE); + } + else { /* SCF_DO_STCLASS_OR */ + ssc_union(data->start_class, EXACTF_invlist, FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + SvREFCNT_dec(EXACTF_invlist); + } + DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta); + } + else if (REGNODE_VARIES(OP(scan))) { + SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; + I32 fl = 0; + U32 f = flags; + regnode * const oscan = scan; + regnode_ssc this_class; + regnode_ssc *oclass = NULL; + I32 next_is_eval = 0; + + switch (REGNODE_TYPE(OP(scan))) { + case WHILEM: /* End of (?:...)* . */ + scan = REGNODE_AFTER(scan); + goto finish; + case PLUS: + if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { + next = REGNODE_AFTER(scan); + if ( ( REGNODE_TYPE(OP(next)) == EXACT + && ! isEXACTFish(OP(next))) + || (flags & SCF_DO_STCLASS)) + { + mincount = 1; + maxcount = REG_INFTY; + next = regnext(scan); + scan = REGNODE_AFTER(scan); + goto do_curly; + } + } + if (flags & SCF_DO_SUBSTR) + data->pos_min++; + /* This will bypass the formal 'min += minnext * mincount' + * calculation in the do_curly path, so assumes min width + * of the PLUS payload is exactly one. */ + min++; + /* FALLTHROUGH */ + case STAR: + next = REGNODE_AFTER(scan); + + /* This temporary node can now be turned into EXACTFU, and + * must, as regexec.c doesn't handle it */ + if (OP(next) == EXACTFU_S_EDGE && mutate_ok) { + OP(next) = EXACTFU; + } + + if ( STR_LEN(next) == 1 + && isALPHA_A(* STRING(next)) + && ( OP(next) == EXACTFAA + || ( OP(next) == EXACTFU + && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))) + && mutate_ok + ) { + /* These differ in just one bit */ + U8 mask = ~ ('A' ^ 'a'); + + assert(isALPHA_A(* STRING(next))); + + /* Then replace it by an ANYOFM node, with + * the mask set to the complement of the + * bit that differs between upper and lower + * case, and the lowest code point of the + * pair (which the '&' forces) */ + OP(next) = ANYOFM; + ARG_SET(next, *STRING(next) & mask); + FLAGS(next) = mask; + } + + if (flags & SCF_DO_STCLASS) { + mincount = 0; + maxcount = REG_INFTY; + next = regnext(scan); + scan = REGNODE_AFTER(scan); + goto do_curly; + } + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ + data->cur_is_floating = 1; /* float */ + } + is_inf = is_inf_internal = 1; + scan = regnext(scan); + goto optimize_curly_tail; + case CURLY: + if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) + && (scan->flags == stopparen)) + { + mincount = 1; + maxcount = 1; + } else { + mincount = ARG1(scan); + maxcount = ARG2(scan); + } + next = regnext(scan); + if (OP(scan) == CURLYX) { + I32 lp = (data ? *(data->last_closep) : 0); + scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); + } + scan = REGNODE_AFTER(scan); + next_is_eval = (OP(scan) == EVAL); + do_curly: + if (flags & SCF_DO_SUBSTR) { + if (mincount == 0) + scan_commit(pRExC_state, data, minlenp, is_inf); + /* Cannot extend fixed substrings */ + pos_before = data->pos_min; + } + if (data) { + fl = data->flags; + data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); + if (is_inf) + data->flags |= SF_IS_INF; + } + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + oclass = data->start_class; + data->start_class = &this_class; + f |= SCF_DO_STCLASS_AND; + f &= ~SCF_DO_STCLASS_OR; + } + /* Exclude from super-linear cache processing any {n,m} + regops for which the combination of input pos and regex + pos is not enough information to determine if a match + will be possible. + + For example, in the regex /foo(bar\s*){4,8}baz/ with the + regex pos at the \s*, the prospects for a match depend not + only on the input position but also on how many (bar\s*) + repeats into the {4,8} we are. */ + if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY)) + f &= ~SCF_WHILEM_VISITED_POS; + + /* This will finish on WHILEM, setting scan, or on NULL: */ + /* recurse study_chunk() on loop bodies */ + minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, + last, data, stopparen, recursed_depth, NULL, + (mincount == 0 + ? (f & ~SCF_DO_SUBSTR) + : f) + , depth+1, mutate_ok); + + if (data && data->flags & SCF_SEEN_ACCEPT) { + if (mincount > 1) + mincount = 1; + } + + if (flags & SCF_DO_STCLASS) + data->start_class = oclass; + if (mincount == 0 || minnext == 0) { + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + } + else if (flags & SCF_DO_STCLASS_AND) { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&this_class, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + ANYOF_FLAGS(data->start_class) + |= SSC_MATCHES_EMPTY_STRING; + } + } else { /* Non-zero len */ + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + } + else if (flags & SCF_DO_STCLASS_AND) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); + flags &= ~SCF_DO_STCLASS; + } + if (!scan) /* It was not CURLYX, but CURLY. */ + scan = next; + if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR) + /* ? quantifier ok, except for (?{ ... }) */ + && (next_is_eval || !(mincount == 0 && maxcount == 1)) + && (minnext == 0) && (deltanext == 0) + && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) + && maxcount <= REG_INFTY/3) /* Complement check for big + count */ + { + _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP), + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), + "Quantifier unexpected on zero-length expression " + "in regex m/%" UTF8f "/", + UTF8fARG(UTF, RExC_precomp_end - RExC_precomp, + RExC_precomp))); + } + + if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext ) + || min >= SSize_t_MAX - minnext * mincount ) + { + FAIL("Regexp out of space"); + } + + min += minnext * mincount; + is_inf_internal |= deltanext == OPTIMIZE_INFTY + || (maxcount == REG_INFTY && minnext + deltanext > 0); + is_inf |= is_inf_internal; + if (is_inf) { + delta = OPTIMIZE_INFTY; + } else { + delta += (minnext + deltanext) * maxcount + - minnext * mincount; + } + + if (data && data->flags & SCF_SEEN_ACCEPT) { + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + flags &= ~SCF_DO_SUBSTR; + } + if (stopmin > min) + stopmin = min; + DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta); + } + /* Try powerful optimization CURLYX => CURLYN. */ + if ( OP(oscan) == CURLYX && data + && data->flags & SF_IN_PAR + && !(data->flags & SF_HAS_EVAL) + && !deltanext && minnext == 1 + && mutate_ok + ) { + /* Try to optimize to CURLYN. */ + regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); + regnode * const nxt1 = nxt; +#ifdef DEBUGGING + regnode *nxt2; +#endif + + /* Skip open. */ + nxt = regnext(nxt); + if (!REGNODE_SIMPLE(OP(nxt)) + && !(REGNODE_TYPE(OP(nxt)) == EXACT + && STR_LEN(nxt) == 1)) + goto nogo; +#ifdef DEBUGGING + nxt2 = nxt; +#endif + nxt = regnext(nxt); + if (OP(nxt) != CLOSE) + goto nogo; + if (RExC_open_parens) { + + /*open->CURLYM*/ + RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan); + + /*close->while*/ + RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt) + 2; + } + /* Now we know that nxt2 is the only contents: */ + oscan->flags = (U8)PARNO(nxt); + OP(oscan) = CURLYN; + OP(nxt1) = NOTHING; /* was OPEN. */ + +#ifdef DEBUGGING + OP(nxt1 + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ + NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ + OP(nxt) = OPTIMIZED; /* was CLOSE. */ + OP(nxt + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ +#endif + } + nogo: + + /* Try optimization CURLYX => CURLYM. */ + if ( OP(oscan) == CURLYX && data + && !(data->flags & SF_HAS_PAR) + && !(data->flags & SF_HAS_EVAL) + && !deltanext /* atom is fixed width */ + && minnext != 0 /* CURLYM can't handle zero width */ + /* Nor characters whose fold at run-time may be + * multi-character */ + && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN) + && mutate_ok + ) { + /* XXXX How to optimize if data == 0? */ + /* Optimize to a simpler form. */ + regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); /* OPEN */ + regnode *nxt2; + + OP(oscan) = CURLYM; + while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ + && (OP(nxt2) != WHILEM)) + nxt = nxt2; + OP(nxt2) = SUCCEED; /* Whas WHILEM */ + /* Need to optimize away parenths. */ + if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { + /* Set the parenth number. */ + /* note that we have changed the type of oscan to CURLYM here */ + regnode *nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); /* OPEN*/ + + oscan->flags = (U8)PARNO(nxt); + if (RExC_open_parens) { + /*open->CURLYM*/ + RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan); + + /*close->NOTHING*/ + RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt2) + + 1; + } + OP(nxt1) = OPTIMIZED; /* was OPEN. */ + OP(nxt) = OPTIMIZED; /* was CLOSE. */ + +#ifdef DEBUGGING + OP(nxt1 + 1) = OPTIMIZED; /* was count. */ + OP(nxt + 1) = OPTIMIZED; /* was count. */ + NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */ + NEXT_OFF(nxt + 1) = 0; /* just for consistency. */ +#endif +#if 0 + while ( nxt1 && (OP(nxt1) != WHILEM)) { + regnode *nnxt = regnext(nxt1); + if (nnxt == nxt) { + if (REGNODE_OFF_BY_ARG(OP(nxt1))) + ARG_SET(nxt1, nxt2 - nxt1); + else if (nxt2 - nxt1 < U16_MAX) + NEXT_OFF(nxt1) = nxt2 - nxt1; + else + OP(nxt) = NOTHING; /* Cannot beautify */ + } + nxt1 = nnxt; + } +#endif + /* Optimize again: */ + /* recurse study_chunk() on optimised CURLYX => CURLYM */ + study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, + NULL, stopparen, recursed_depth, NULL, 0, + depth+1, mutate_ok); + } + else + oscan->flags = 0; + } + else if ((OP(oscan) == CURLYX) + && (flags & SCF_WHILEM_VISITED_POS) + /* See the comment on a similar expression above. + However, this time it's not a subexpression + we care about, but the expression itself. */ + && (maxcount == REG_INFTY) + && data) { + /* This stays as CURLYX, we can put the count/of pair. */ + /* Find WHILEM (as in regexec.c) */ + regnode *nxt = oscan + NEXT_OFF(oscan); + + if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */ + nxt += ARG(nxt); + nxt = REGNODE_BEFORE(nxt); + if (nxt->flags & 0xf) { + /* we've already set whilem count on this node */ + } else if (++data->whilem_c < 16) { + assert(data->whilem_c <= RExC_whilem_seen); + nxt->flags = (U8)(data->whilem_c + | (RExC_whilem_seen << 4)); /* On WHILEM */ + } + } + if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (flags & SCF_DO_SUBSTR) { + SV *last_str = NULL; + STRLEN last_chrs = 0; + int counted = mincount != 0; + + if (data->last_end > 0 && mincount != 0) { /* Ends with a + string. */ + SSize_t b = pos_before >= data->last_start_min + ? pos_before : data->last_start_min; + STRLEN l; + const char * const s = SvPV_const(data->last_found, l); + SSize_t old = b - data->last_start_min; + assert(old >= 0); + + if (UTF) + old = utf8_hop_forward((U8*)s, old, + (U8 *) SvEND(data->last_found)) + - (U8*)s; + l -= old; + /* Get the added string: */ + last_str = newSVpvn_utf8(s + old, l, UTF); + last_chrs = UTF ? utf8_length((U8*)(s + old), + (U8*)(s + old + l)) : l; + if (deltanext == 0 && pos_before == b) { + /* What was added is a constant string */ + if (mincount > 1) { + + SvGROW(last_str, (mincount * l) + 1); + repeatcpy(SvPVX(last_str) + l, + SvPVX_const(last_str), l, + mincount - 1); + SvCUR_set(last_str, SvCUR(last_str) * mincount); + /* Add additional parts. */ + SvCUR_set(data->last_found, + SvCUR(data->last_found) - l); + sv_catsv(data->last_found, last_str); + { + SV * sv = data->last_found; + MAGIC *mg = + SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + if (mg && mg->mg_len >= 0) + mg->mg_len += last_chrs * (mincount-1); + } + last_chrs *= mincount; + data->last_end += l * (mincount - 1); + } + } else { + /* start offset must point into the last copy */ + data->last_start_min += minnext * (mincount - 1); + data->last_start_max = + is_inf + ? OPTIMIZE_INFTY + : data->last_start_max + + (maxcount - 1) * (minnext + data->pos_delta); + } + } + /* It is counted once already... */ + data->pos_min += minnext * (mincount - counted); +#if 0 + Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf + " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf + " maxcount=%" UVuf " mincount=%" UVuf + " data->pos_delta=%" UVuf "\n", + (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, + (UV)maxcount, (UV)mincount, (UV)data->pos_delta); + if (deltanext != OPTIMIZE_INFTY) + Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", + (UV)(-counted * deltanext + (minnext + deltanext) * maxcount + - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta)); +#endif + if (deltanext == OPTIMIZE_INFTY + || data->pos_delta == OPTIMIZE_INFTY + || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta) + data->pos_delta = OPTIMIZE_INFTY; + else + data->pos_delta += - counted * deltanext + + (minnext + deltanext) * maxcount - minnext * mincount; + if (mincount != maxcount) { + /* Cannot extend fixed substrings found inside + the group. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + if (mincount && last_str) { + SV * const sv = data->last_found; + MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? + mg_find(sv, PERL_MAGIC_utf8) : NULL; + + if (mg) + mg->mg_len = -1; + sv_setsv(sv, last_str); + data->last_end = data->pos_min; + data->last_start_min = data->pos_min - last_chrs; + data->last_start_max = is_inf + ? OPTIMIZE_INFTY + : data->pos_min + data->pos_delta - last_chrs; + } + data->cur_is_floating = 1; /* float */ + } + SvREFCNT_dec(last_str); + } + if (data && (fl & SF_HAS_EVAL)) + data->flags |= SF_HAS_EVAL; + optimize_curly_tail: + rck_elide_nothing(oscan); + continue; + + default: + Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", + OP(scan)); + case REF: + case CLUMP: + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->cur_is_floating = 1; /* float */ + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) { + if (OP(scan) == CLUMP) { + /* Actually is any start char, but very few code points + * aren't start characters */ + ssc_match_all_cp(data->start_class); + } + else { + ssc_anything(data->start_class); + } + } + flags &= ~SCF_DO_STCLASS; + break; + } + } + else if (OP(scan) == LNBREAK) { + if (flags & SCF_DO_STCLASS) { + if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + PL_XPosix_ptrs[CC_VERTSPACE_], FALSE); + ssc_clear_locale(data->start_class); + ANYOF_FLAGS(data->start_class) + &= ~SSC_MATCHES_EMPTY_STRING; + } + else if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + PL_XPosix_ptrs[CC_VERTSPACE_], + FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + + /* See commit msg for + * 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) + &= ~SSC_MATCHES_EMPTY_STRING; + } + flags &= ~SCF_DO_STCLASS; + } + min++; + if (delta != OPTIMIZE_INFTY) + delta++; /* Because of the 2 char string cr-lf */ + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min += 1; + if (data->pos_delta != OPTIMIZE_INFTY) { + data->pos_delta += 1; + } + data->cur_is_floating = 1; /* float */ + } + } + else if (REGNODE_SIMPLE(OP(scan))) { + + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min++; + } + min++; + if (flags & SCF_DO_STCLASS) { + bool invert = 0; + SV* my_invlist = NULL; + U8 namedclass; + + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; + + /* Some of the logic below assumes that switching + locale on will only add false positives. */ + switch (OP(scan)) { + + default: +#ifdef DEBUGGING + Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", + OP(scan)); +#endif + case SANY: + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_match_all_cp(data->start_class); + break; + + case REG_ANY: + { + SV* REG_ANY_invlist = _new_invlist(2); + REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, + '\n'); + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert, hence all but \n + */ + ); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + REG_ANY_invlist, + TRUE /* TRUE => invert */ + ); + ssc_clear_locale(data->start_class); + } + SvREFCNT_dec_NN(REG_ANY_invlist); + } + break; + + case ANYOFD: + case ANYOFL: + case ANYOFPOSIXL: + case ANYOFH: + case ANYOFHb: + case ANYOFHr: + case ANYOFHs: + case ANYOF: + if (flags & SCF_DO_STCLASS_AND) + ssc_and(pRExC_state, data->start_class, + (regnode_charclass *) scan); + else + ssc_or(pRExC_state, data->start_class, + (regnode_charclass *) scan); + break; + + case ANYOFHbbm: + { + SV* cp_list = get_ANYOFHbbm_contents(scan); + + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, cp_list, invert); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, cp_list, invert); + } + + SvREFCNT_dec_NN(cp_list); + break; + } + + case NANYOFM: /* NANYOFM already contains the inversion of the + input ANYOF data, so, unlike things like + NPOSIXA, don't change 'invert' to TRUE */ + /* FALLTHROUGH */ + case ANYOFM: + { + SV* cp_list = get_ANYOFM_contents(scan); + + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, cp_list, invert); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, cp_list, invert); + } + + SvREFCNT_dec_NN(cp_list); + break; + } + + case ANYOFR: + case ANYOFRb: + { + SV* cp_list = NULL; + + cp_list = _add_range_to_invlist(cp_list, + ANYOFRbase(scan), + ANYOFRbase(scan) + ANYOFRdelta(scan)); + + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, cp_list, invert); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, cp_list, invert); + } + + SvREFCNT_dec_NN(cp_list); + break; + } + + case NPOSIXL: + invert = 1; + /* FALLTHROUGH */ + + case POSIXL: + namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; + if (flags & SCF_DO_STCLASS_AND) { + bool was_there = cBOOL( + ANYOF_POSIXL_TEST(data->start_class, + namedclass)); + ANYOF_POSIXL_ZERO(data->start_class); + if (was_there) { /* Do an AND */ + ANYOF_POSIXL_SET(data->start_class, namedclass); + } + /* No individual code points can now match */ + data->start_class->invlist + = sv_2mortal(_new_invlist(0)); + } + else { + int complement = namedclass + ((invert) ? -1 : 1); + + assert(flags & SCF_DO_STCLASS_OR); + + /* If the complement of this class was already there, + * the result is that they match all code points, + * (\d + \D == everything). Remove the classes from + * future consideration. Locale is not relevant in + * this case */ + if (ANYOF_POSIXL_TEST(data->start_class, complement)) { + ssc_match_all_cp(data->start_class); + ANYOF_POSIXL_CLEAR(data->start_class, namedclass); + ANYOF_POSIXL_CLEAR(data->start_class, complement); + } + else { /* The usual case; just add this class to the + existing set */ + ANYOF_POSIXL_SET(data->start_class, namedclass); + } + } + break; + + case NPOSIXA: /* For these, we always know the exact set of + what's matched */ + invert = 1; + /* FALLTHROUGH */ + case POSIXA: + my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL); + goto join_posix_and_ascii; + + case NPOSIXD: + case NPOSIXU: + invert = 1; + /* FALLTHROUGH */ + case POSIXD: + case POSIXU: + my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL); + + /* NPOSIXD matches all upper Latin1 code points unless the + * target string being matched is UTF-8, which is + * unknowable until match time. Since we are going to + * invert, we want to get rid of all of them so that the + * inversion will match all */ + if (OP(scan) == NPOSIXD) { + _invlist_subtract(my_invlist, PL_UpperLatin1, + &my_invlist); + } + + join_posix_and_ascii: + + if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, my_invlist, invert); + ssc_clear_locale(data->start_class); + } + else { + assert(flags & SCF_DO_STCLASS_OR); + ssc_union(data->start_class, my_invlist, invert); + } + SvREFCNT_dec(my_invlist); + } + if (flags & SCF_DO_STCLASS_OR) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) { + data->flags |= (OP(scan) == MEOL + ? SF_BEFORE_MEOL + : SF_BEFORE_SEOL); + scan_commit(pRExC_state, data, minlenp, is_inf); + + } + else if ( REGNODE_TYPE(OP(scan)) == BRANCHJ + /* Lookbehind, or need to calculate parens/evals/stclass: */ + && (scan->flags || data || (flags & SCF_DO_STCLASS)) + && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) + { + if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY + || OP(scan) == UNLESSM ) + { + /* Negative Lookahead/lookbehind + In this case we can't do fixed string optimisation. + */ + + bool is_positive = OP(scan) == IFMATCH ? 1 : 0; + SSize_t deltanext, minnext; + SSize_t fake_last_close = 0; + regnode *fake_last_close_op = NULL; + regnode *cur_last_close_op; + regnode *nscan; + regnode_ssc intrnl; + U32 f = (flags & SCF_TRIE_DOING_RESTUDY); + + StructCopy(&zero_scan_data, &data_fake, scan_data_t); + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + data_fake.last_close_opp = data->last_close_opp; + } + else { + data_fake.last_closep = &fake_last_close; + data_fake.last_close_opp = &fake_last_close_op; + } + + /* remember the last_close_op we saw so we can see if + * we are dealing with variable length lookbehind that + * contains capturing buffers, which are considered + * experimental */ + cur_last_close_op= *(data_fake.last_close_opp); + + data_fake.pos_delta = delta; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + ssc_init(pRExC_state, &intrnl); + data_fake.start_class = &intrnl; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + next = regnext(scan); + nscan = REGNODE_AFTER(scan); + + /* recurse study_chunk() for lookahead body */ + minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, + last, &data_fake, stopparen, + recursed_depth, NULL, f, depth+1, + mutate_ok); + + if (scan->flags) { + if ( deltanext < 0 + || deltanext > (I32) U8_MAX + || minnext > (I32)U8_MAX + || minnext + deltanext > (I32)U8_MAX) + { + FAIL2("Lookbehind longer than %" UVuf " not implemented", + (UV)U8_MAX); + } + + /* The 'next_off' field has been repurposed to count the + * additional starting positions to try beyond the initial + * one. (This leaves it at 0 for non-variable length + * matches to avoid breakage for those not using this + * extension) */ + if (deltanext) { + scan->next_off = deltanext; + if ( + /* See a CLOSE op inside this lookbehind? */ + cur_last_close_op != *(data_fake.last_close_opp) + /* and not doing restudy. see: restudied */ + && !(flags & SCF_TRIE_DOING_RESTUDY) + ) { + /* this is positive variable length lookbehind with + * capture buffers inside of it */ + ckWARNexperimental_with_arg(RExC_parse, + WARN_EXPERIMENTAL__VLB, + "Variable length %s lookbehind with capturing is experimental", + is_positive ? "positive" : "negative"); + } + } + scan->flags = (U8)minnext + deltanext; + } + if (data) { + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (f & SCF_DO_STCLASS_AND) { + if (flags & SCF_DO_STCLASS_OR) { + /* OR before, AND after: ideally we would recurse with + * data_fake to get the AND applied by study of the + * remainder of the pattern, and then derecurse; + * *** HACK *** for now just treat as "no information". + * See [perl #56690]. + */ + ssc_init(pRExC_state, data->start_class); + } else { + /* AND before and after: combine and continue. These + * assertions are zero-length, so can match an EMPTY + * string */ + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); + ANYOF_FLAGS(data->start_class) + |= SSC_MATCHES_EMPTY_STRING; + } + } + DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta); + } +#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY + else { + /* Positive Lookahead/lookbehind + In this case we can do fixed string optimisation, + but we must be careful about it. Note in the case of + lookbehind the positions will be offset by the minimum + length of the pattern, something we won't know about + until after the recurse. + */ + SSize_t deltanext, fake_last_close = 0; + regnode *last_close_op = NULL; + regnode *nscan; + regnode_ssc intrnl; + U32 f = (flags & SCF_TRIE_DOING_RESTUDY); + /* We use SAVEFREEPV so that when the full compile + is finished perl will clean up the allocated + minlens when it's all done. This way we don't + have to worry about freeing them when we know + they wont be used, which would be a pain. + */ + SSize_t *minnextp; + Newx( minnextp, 1, SSize_t ); + SAVEFREEPV(minnextp); + + if (data) { + StructCopy(data, &data_fake, scan_data_t); + if ((flags & SCF_DO_SUBSTR) && data->last_found) { + f |= SCF_DO_SUBSTR; + if (scan->flags) + scan_commit(pRExC_state, &data_fake, minlenp, is_inf); + data_fake.last_found=newSVsv(data->last_found); + } + } + else { + data_fake.last_closep = &fake_last_close; + data_fake.last_close_opp = &fake_last_close_opp; + } + data_fake.flags = 0; + data_fake.substrs[0].flags = 0; + data_fake.substrs[1].flags = 0; + data_fake.pos_delta = delta; + if (is_inf) + data_fake.flags |= SF_IS_INF; + if ( flags & SCF_DO_STCLASS && !scan->flags + && OP(scan) == IFMATCH ) { /* Lookahead */ + ssc_init(pRExC_state, &intrnl); + data_fake.start_class = &intrnl; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + next = regnext(scan); + nscan = REGNODE_AFTER(scan); + + /* positive lookahead study_chunk() recursion */ + *minnextp = study_chunk(pRExC_state, &nscan, minnextp, + &deltanext, last, &data_fake, + stopparen, recursed_depth, NULL, + f, depth+1, mutate_ok); + if (scan->flags) { + assert(0); /* This code has never been tested since this + is normally not compiled */ + if ( deltanext < 0 + || deltanext > (I32) U8_MAX + || *minnextp > (I32)U8_MAX + || *minnextp + deltanext > (I32)U8_MAX) + { + FAIL2("Lookbehind longer than %" UVuf " not implemented", + (UV)U8_MAX); + } + + if (deltanext) { + scan->next_off = deltanext; + } + scan->flags = (U8)*minnextp + deltanext; + } + + *minnextp += min; + + if (f & SCF_DO_STCLASS_AND) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); + ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING; + } + if (data) { + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { + int i; + if (RExC_rx->minlen < *minnextp) + RExC_rx->minlen = *minnextp; + scan_commit(pRExC_state, &data_fake, minnextp, is_inf); + SvREFCNT_dec_NN(data_fake.last_found); + + for (i = 0; i < 2; i++) { + if (data_fake.substrs[i].minlenp != minlenp) { + data->substrs[i].min_offset = + data_fake.substrs[i].min_offset; + data->substrs[i].max_offset = + data_fake.substrs[i].max_offset; + data->substrs[i].minlenp = + data_fake.substrs[i].minlenp; + data->substrs[i].lookbehind += scan->flags; + } + } + } + } + } +#endif + } + else if (OP(scan) == OPEN) { + if (stopparen != (I32)PARNO(scan)) + pars++; + } + else if (OP(scan) == CLOSE) { + if (stopparen == (I32)PARNO(scan)) { + break; + } + if ((I32)PARNO(scan) == is_par) { + next = regnext(scan); + + if ( next && (OP(next) != WHILEM) && next < last) + is_par = 0; /* Disable optimization */ + } + if (data) { + *(data->last_closep) = PARNO(scan); + *(data->last_close_opp) = scan; + } + } + else if (OP(scan) == EVAL) { + if (data) + data->flags |= SF_HAS_EVAL; + } + else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) { + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + flags &= ~SCF_DO_SUBSTR; + } + if (OP(scan)==ACCEPT) { + /* m{(*ACCEPT)x} does not have to start with 'x' */ + flags &= ~SCF_DO_STCLASS; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + if (stopmin > min) + stopmin = min; + } + } + else if (OP(scan) == COMMIT) { + /* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we + * must not end up with "abcxyz" as a fixed substring else we'll + * skip straight to attempting to match at offset 4. + */ + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + flags &= ~SCF_DO_SUBSTR; + } + } + else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ + { + if (flags & SCF_DO_SUBSTR) { + scan_commit(pRExC_state, data, minlenp, is_inf); + data->cur_is_floating = 1; /* float */ + } + is_inf = is_inf_internal = 1; + if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ + ssc_anything(data->start_class); + flags &= ~SCF_DO_STCLASS; + } + else if (OP(scan) == GPOS) { + if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && + !(delta || is_inf || (data && data->pos_delta))) + { + if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) + RExC_rx->intflags |= PREGf_ANCH_GPOS; + if (RExC_rx->gofs < (STRLEN)min) + RExC_rx->gofs = min; + } else { + RExC_rx->intflags |= PREGf_GPOS_FLOAT; + RExC_rx->gofs = 0; + } + } +#ifdef TRIE_STUDY_OPT +#ifdef FULL_TRIE_STUDY + else if (REGNODE_TYPE(OP(scan)) == TRIE) { + /* NOTE - There is similar code to this block above for handling + BRANCH nodes on the initial study. If you change stuff here + check there too. */ + regnode *trie_node= scan; + regnode *tail= regnext(scan); + reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; + SSize_t max1 = 0, min1 = OPTIMIZE_INFTY; + regnode_ssc accum; + + if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ + /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data, minlenp, is_inf); + } + if (flags & SCF_DO_STCLASS) + ssc_init_zero(pRExC_state, &accum); + + if (!trie->jump) { + min1= trie->minlen; + max1= trie->maxlen; + } else { + const regnode *nextbranch= NULL; + U32 word; + + for ( word=1 ; word <= trie->wordcount ; word++) + { + SSize_t deltanext = 0, minnext = 0; + U32 f = (flags & SCF_TRIE_DOING_RESTUDY); + SSize_t fake_last_close = 0; + regnode *fake_last_close_op = NULL; + regnode_ssc this_class; + + StructCopy(&zero_scan_data, &data_fake, scan_data_t); + if (data) { + data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + data_fake.last_close_opp = data->last_close_opp; + } + else { + data_fake.last_closep = &fake_last_close; + data_fake.last_close_opp = &fake_last_close_op; + } + data_fake.pos_delta = delta; + if (flags & SCF_DO_STCLASS) { + ssc_init(pRExC_state, &this_class); + data_fake.start_class = &this_class; + f |= SCF_DO_STCLASS_AND; + } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; + + if (trie->jump[word]) { + if (!nextbranch) + nextbranch = trie_node + trie->jump[0]; + scan= trie_node + trie->jump[word]; + /* We go from the jump point to the branch that follows + it. Note this means we need the vestigal unused + branches even though they arent otherwise used. */ + /* optimise study_chunk() for TRIE */ + minnext = study_chunk(pRExC_state, &scan, minlenp, + &deltanext, (regnode *)nextbranch, &data_fake, + stopparen, recursed_depth, NULL, f, depth+1, + mutate_ok); + } + if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH) + nextbranch= regnext((regnode*)nextbranch); + + if (min1 > (SSize_t)(minnext + trie->minlen)) + min1 = minnext + trie->minlen; + if (deltanext == OPTIMIZE_INFTY) { + is_inf = is_inf_internal = 1; + max1 = OPTIMIZE_INFTY; + } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) + max1 = minnext + deltanext + trie->maxlen; + + if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) + pars++; + if (data_fake.flags & SCF_SEEN_ACCEPT) { + if ( stopmin > min + min1) + stopmin = min + min1; + flags &= ~SCF_DO_SUBSTR; + if (data) + data->flags |= SCF_SEEN_ACCEPT; + } + if (data) { + if (data_fake.flags & SF_HAS_EVAL) + data->flags |= SF_HAS_EVAL; + data->whilem_c = data_fake.whilem_c; + } + if (flags & SCF_DO_STCLASS) + ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); + } + DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta); + } + if (flags & SCF_DO_SUBSTR) { + data->pos_min += min1; + data->pos_delta += max1 - min1; + if (max1 != min1 || is_inf) + data->cur_is_floating = 1; /* float */ + } + min += min1; + if (delta != OPTIMIZE_INFTY) { + if (OPTIMIZE_INFTY - (max1 - min1) >= delta) + delta += max1 - min1; + else + delta = OPTIMIZE_INFTY; + } + if (flags & SCF_DO_STCLASS_OR) { + ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + flags &= ~SCF_DO_STCLASS; + } + } + else if (flags & SCF_DO_STCLASS_AND) { + if (min1) { + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); + flags &= ~SCF_DO_STCLASS; + } + else { + /* Switch to OR mode: cache the old value of + * data->start_class */ + INIT_AND_WITHP; + StructCopy(data->start_class, and_withp, regnode_ssc); + flags &= ~SCF_DO_STCLASS_AND; + StructCopy(&accum, data->start_class, regnode_ssc); + flags |= SCF_DO_STCLASS_OR; + } + } + scan= tail; + DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta); + continue; + } +#else + else if (REGNODE_TYPE(OP(scan)) == TRIE) { + reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; + U8*bang=NULL; + + min += trie->minlen; + delta += (trie->maxlen - trie->minlen); + flags &= ~SCF_DO_STCLASS; /* xxx */ + if (flags & SCF_DO_SUBSTR) { + /* Cannot expect anything... */ + scan_commit(pRExC_state, data, minlenp, is_inf); + data->pos_min += trie->minlen; + data->pos_delta += (trie->maxlen - trie->minlen); + if (trie->maxlen != trie->minlen) + data->cur_is_floating = 1; /* float */ + } + if (trie->jump) /* no more substrings -- for now /grr*/ + flags &= ~SCF_DO_SUBSTR; + } + +#endif /* old or new */ +#endif /* TRIE_STUDY_OPT */ + + else if (OP(scan) == REGEX_SET) { + Perl_croak(aTHX_ "panic: %s regnode should be resolved" + " before optimization", REGNODE_NAME(REGEX_SET)); + } + + /* Else: zero-length, ignore. */ + scan = regnext(scan); + } + + finish: + if (frame) { + /* we need to unwind recursion. */ + depth = depth - 1; + + DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta); + DEBUG_PEEP("fend", scan, depth, flags); + + /* restore previous context */ + last = frame->last_regnode; + scan = frame->next_regnode; + stopparen = frame->stopparen; + recursed_depth = frame->prev_recursed_depth; + + RExC_frame_last = frame->prev_frame; + frame = frame->this_prev_frame; + goto fake_study_recurse; + } + + assert(!frame); + DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta); + + /* is this pattern infinite? Eg, consider /(a|b+)/ */ + if (is_inf_internal) + delta = OPTIMIZE_INFTY; + + /* deal with (*ACCEPT), Eg, consider /(foo(*ACCEPT)|bop)bar/ */ + if (min > stopmin) { + /* + At this point 'min' represents the minimum length string we can + match while *ignoring* the implication of ACCEPT, and 'delta' + represents the difference between the minimum length and maximum + length, and if the pattern matches an infinitely long string + (consider the + and * quantifiers) then we use the special delta + value of OPTIMIZE_INFTY to represent it. 'stopmin' is the + minimum length that can be matched *and* accepted. + + A pattern is accepted when matching was successful *and* + complete, and thus there is no further matching needing to be + done, no backtracking to occur, etc. Prior to the introduction + of ACCEPT the only opcode that signaled acceptance was the END + opcode, which is always the very last opcode in a regex program. + ACCEPT is thus conceptually an early successful return out of + the matching process. stopmin starts out as OPTIMIZE_INFTY to + represent "the entire pattern", and is ratched down to the + "current min" if necessary when an ACCEPT opcode is encountered. + + Thus stopmin might be smaller than min if we saw an (*ACCEPT), + and we now need to account for it in both min and delta. + Consider that in a pattern /AB/ normally the min length it can + match can be computed as min(A)+min(B). But (*ACCEPT) means + that it might be something else, not even neccesarily min(A) at + all. Consider + + A = /(foo(*ACCEPT)|x+)/ + B = /whop/ + AB = /(foo(*ACCEPT)|x+)whop/ + + The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY + for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for + "whop", and the delta of 0 as the pattern is of fixed length, the + stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT. + When handling AB we expect to see a min of 5 for "xwhop", and a + delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3 + for "foo". This should result in a final min of 3 for "foo", and + a final delta of OPTIMIZE_INFTY for "xxxxx...whop". + + In something like /(dude(*ACCEPT)|irk)x{3,7}/ we would have a + min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the + stop min would be 4 for "dude". This should result in a final + min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx". + + When min is smaller than stopmin then we can ignore it. In the + fragment /(x{10,20}(*ACCEPT)|a)b+/, we would have a min of 2, + and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously + the ACCEPT doesn't reduce the minimum length of the string that + might be matched, nor affect the maximum length. + + In something like /foo(*ACCEPT)ba?r/ we would have a min of 5 + for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for + "foo". We currently turn this into a min of 3 for "foo" and a + delta of 3 for "foobar" even though technically "foobar" isn't + possible. ACCEPT affects some aspects of the optimizer, like + length computations and mandatory substring optimizations, but + there are other optimzations this routine perfoms that are not + affected and this compromise simplifies implementation. + + It might be helpful to consider that this C function is called + recursively on the pattern in a bottom up fashion, and that the + min returned by a nested call may be marked as coming from an + ACCEPT, causing its callers to treat the returned min as a + stopmin as the recursion unwinds. Thus a single ACCEPT can affect + multiple calls into this function in different ways. + */ + + if (OPTIMIZE_INFTY - delta >= min - stopmin) + delta += min - stopmin; + else + delta = OPTIMIZE_INFTY; + min = stopmin; + } + + *scanp = scan; + *deltap = delta; + + if (flags & SCF_DO_SUBSTR && is_inf) + data->pos_delta = OPTIMIZE_INFTY - data->pos_min; + if (is_par > (I32)U8_MAX) + is_par = 0; + if (is_par && pars==1 && data) { + data->flags |= SF_IN_PAR; + data->flags &= ~SF_HAS_PAR; + } + else if (pars && data) { + data->flags |= SF_HAS_PAR; + data->flags &= ~SF_IN_PAR; + } + if (flags & SCF_DO_STCLASS_OR) + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); + if (flags & SCF_TRIE_RESTUDY) + data->flags |= SCF_TRIE_RESTUDY; + + + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) { + if (min > OPTIMIZE_INFTY - delta) + RExC_maxlen = OPTIMIZE_INFTY; + else if (RExC_maxlen < min + delta) + RExC_maxlen = min + delta; + } + DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta); + return min; +} diff --git a/regcomp_trie.c b/regcomp_trie.c new file mode 100644 index 0000000000..65963cacf2 --- /dev/null +++ b/regcomp_trie.c @@ -0,0 +1,1688 @@ +#ifdef PERL_EXT_RE_BUILD +#include "re_top.h" +#endif + +#include "EXTERN.h" +#define PERL_IN_REGEX_ENGINE +#define PERL_IN_REGCOMP_ANY +#define PERL_IN_REGCOMP_TRIE_C +#include "perl.h" + +#ifdef PERL_IN_XSUB_RE +# include "re_comp.h" +#else +# include "regcomp.h" +#endif + +#include "invlist_inline.h" +#include "unicode_constants.h" +#include "regcomp_internal.h" + +#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] +#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) +#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) +#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ + ? (TRIE_LIST_CUR( idx ) - 1) \ + : 0 ) + + +#ifdef DEBUGGING +/* + dump_trie(trie,widecharmap,revcharmap) + dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) + dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) + + These routines dump out a trie in a somewhat readable format. + The _interim_ variants are used for debugging the interim + tables that are used to generate the final compressed + representation which is what dump_trie expects. + + Part of the reason for their existence is to provide a form + of documentation as to how the different representations function. + +*/ + +/* + Dumps the final compressed table form of the trie to Perl_debug_log. + Used for debugging make_trie(). +*/ + +STATIC void +S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, + AV *revcharmap, U32 depth) +{ + U32 state; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + U16 word; + DECLARE_AND_GET_RE_DEBUG_FLAGS; + + PERL_ARGS_ASSERT_DUMP_TRIE; + + Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ", + depth+1, "Match","Base","Ofs" ); + + for( state = 0 ; state < trie->uniquecharcount ; state++ ) { + SV ** const tmp = av_fetch_simple( revcharmap, state, 0); + if ( tmp ) { + Perl_re_printf( aTHX_ "%*s", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + } + } + Perl_re_printf( aTHX_ "\n"); + Perl_re_indentf( aTHX_ "State|-----------------------", depth+1); + + for( state = 0 ; state < trie->uniquecharcount ; state++ ) + Perl_re_printf( aTHX_ "%.*s", colwidth, "--------"); + Perl_re_printf( aTHX_ "\n"); + + for( state = 1 ; state < trie->statecount ; state++ ) { + const U32 base = trie->states[ state ].trans.base; + + Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state); + + if ( trie->states[ state ].wordnum ) { + Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum ); + } else { + Perl_re_printf( aTHX_ "%6s", "" ); + } + + Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base ); + + if ( base ) { + U32 ofs = 0; + + while( ( base + ofs < trie->uniquecharcount ) || + ( base + ofs - trie->uniquecharcount < trie->lasttrans + && trie->trans[ base + ofs - trie->uniquecharcount ].check + != state)) + ofs++; + + Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs); + + for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { + if ( ( base + ofs >= trie->uniquecharcount ) + && ( base + ofs - trie->uniquecharcount + < trie->lasttrans ) + && trie->trans[ base + ofs + - trie->uniquecharcount ].check == state ) + { + Perl_re_printf( aTHX_ "%*" UVXf, colwidth, + (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next + ); + } else { + Perl_re_printf( aTHX_ "%*s", colwidth," ." ); + } + } + + Perl_re_printf( aTHX_ "]"); + + } + Perl_re_printf( aTHX_ "\n" ); + } + Perl_re_indentf( aTHX_ "word_info N:(prev,len)=", + depth); + for (word=1; word <= trie->wordcount; word++) { + Perl_re_printf( aTHX_ " %d:(%d,%d)", + (int)word, (int)(trie->wordinfo[word].prev), + (int)(trie->wordinfo[word].len)); + } + Perl_re_printf( aTHX_ "\n" ); +} +/* + Dumps a fully constructed but uncompressed trie in list form. + List tries normally only are used for construction when the number of + possible chars (trie->uniquecharcount) is very high. + Used for debugging make_trie(). +*/ +STATIC void +S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, + HV *widecharmap, AV *revcharmap, U32 next_alloc, + U32 depth) +{ + U32 state; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + DECLARE_AND_GET_RE_DEBUG_FLAGS; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; + + /* print out the table precompression. */ + Perl_re_indentf( aTHX_ "State :Word | Transition Data\n", + depth+1 ); + Perl_re_indentf( aTHX_ "%s", + depth+1, "------:-----+-----------------\n" ); + + for( state=1 ; state < next_alloc ; state ++ ) { + U16 charid; + + Perl_re_indentf( aTHX_ " %4" UVXf " :", + depth+1, (UV)state ); + if ( ! trie->states[ state ].wordnum ) { + Perl_re_printf( aTHX_ "%5s| ",""); + } else { + Perl_re_printf( aTHX_ "W%4x| ", + trie->states[ state ].wordnum + ); + } + for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { + SV ** const tmp = av_fetch_simple( revcharmap, + TRIE_LIST_ITEM(state, charid).forid, 0); + if ( tmp ) { + Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), + colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) + | PERL_PV_ESCAPE_FIRSTCHAR + ) , + TRIE_LIST_ITEM(state, charid).forid, + (UV)TRIE_LIST_ITEM(state, charid).newstate + ); + if (!(charid % 10)) + Perl_re_printf( aTHX_ "\n%*s| ", + (int)((depth * 2) + 14), ""); + } + } + Perl_re_printf( aTHX_ "\n"); + } +} + +/* + Dumps a fully constructed but uncompressed trie in table form. + This is the normal DFA style state transition table, with a few + twists to facilitate compression later. + Used for debugging make_trie(). +*/ +STATIC void +S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, + HV *widecharmap, AV *revcharmap, U32 next_alloc, + U32 depth) +{ + U32 state; + U16 charid; + SV *sv=sv_newmortal(); + int colwidth= widecharmap ? 6 : 4; + DECLARE_AND_GET_RE_DEBUG_FLAGS; + + PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; + + /* + print out the table precompression so that we can do a visual check + that they are identical. + */ + + Perl_re_indentf( aTHX_ "Char : ", depth+1 ); + + for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { + SV ** const tmp = av_fetch_simple( revcharmap, charid, 0); + if ( tmp ) { + Perl_re_printf( aTHX_ "%*s", + colwidth, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + } + } + + Perl_re_printf( aTHX_ "\n"); + Perl_re_indentf( aTHX_ "State+-", depth+1 ); + + for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { + Perl_re_printf( aTHX_ "%.*s", colwidth,"--------"); + } + + Perl_re_printf( aTHX_ "\n" ); + + for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { + + Perl_re_indentf( aTHX_ "%4" UVXf " : ", + depth+1, + (UV)TRIE_NODENUM( state ) ); + + for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { + UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); + if (v) + Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v ); + else + Perl_re_printf( aTHX_ "%*s", colwidth, "." ); + } + if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { + Perl_re_printf( aTHX_ " (%4" UVXf ")\n", + (UV)trie->trans[ state ].check ); + } else { + Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n", + (UV)trie->trans[ state ].check, + trie->states[ TRIE_NODENUM( state ) ].wordnum ); + } + } +} + +#endif + + +/* make_trie(startbranch,first,last,tail,word_count,flags,depth) + startbranch: the first branch in the whole branch sequence + first : start branch of sequence of branch-exact nodes. + May be the same as startbranch + last : Thing following the last branch. + May be the same as tail. + tail : item following the branch sequence + count : words in the sequence + flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/ + depth : indent depth + +Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. + +A trie is an N'ary tree where the branches are determined by digital +decomposition of the key. IE, at the root node you look up the 1st character and +follow that branch repeat until you find the end of the branches. Nodes can be +marked as "accepting" meaning they represent a complete word. Eg: + + /he|she|his|hers/ + +would convert into the following structure. Numbers represent states, letters +following numbers represent valid transitions on the letter from that state, if +the number is in square brackets it represents an accepting state, otherwise it +will be in parenthesis. + + +-h->+-e->[3]-+-r->(8)-+-s->[9] + | | + | (2) + | | + (1) +-i->(6)-+-s->[7] + | + +-s->(3)-+-h->(4)-+-e->[5] + + Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) + +This shows that when matching against the string 'hers' we will begin at state 1 +read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, +then read 'r' and go to state 8 followed by 's' which takes us to state 9 which +is also accepting. Thus we know that we can match both 'he' and 'hers' with a +single traverse. We store a mapping from accepting to state to which word was +matched, and then when we have multiple possibilities we try to complete the +rest of the regex in the order in which they occurred in the alternation. + +The only prior NFA like behaviour that would be changed by the TRIE support is +the silent ignoring of duplicate alternations which are of the form: + + / (DUPE|DUPE) X? (?{ ... }) Y /x + +Thus EVAL blocks following a trie may be called a different number of times with +and without the optimisation. With the optimisations dupes will be silently +ignored. This inconsistent behaviour of EVAL type nodes is well established as +the following demonstrates: + + 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ + +which prints out 'word' three times, but + + 'words'=~/(word|word|word)(?{ print $1 })S/ + +which doesnt print it out at all. This is due to other optimisations kicking in. + +Example of what happens on a structural level: + +The regexp /(ac|ad|ab)+/ will produce the following debug output: + + 1: CURLYM[1] {1,32767}(18) + 5: BRANCH(8) + 6: EXACT <ac>(16) + 8: BRANCH(11) + 9: EXACT <ad>(16) + 11: BRANCH(14) + 12: EXACT <ab>(16) + 16: SUCCEED(0) + 17: NOTHING(18) + 18: END(0) + +This would be optimizable with startbranch=5, first=5, last=16, tail=16 +and should turn into: + + 1: CURLYM[1] {1,32767}(18) + 5: TRIE(16) + [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] + <ac> + <ad> + <ab> + 16: SUCCEED(0) + 17: NOTHING(18) + 18: END(0) + +Cases where tail != last would be like /(?foo|bar)baz/: + + 1: BRANCH(4) + 2: EXACT <foo>(8) + 4: BRANCH(7) + 5: EXACT <bar>(8) + 7: TAIL(8) + 8: EXACT <baz>(10) + 10: END(0) + +which would be optimizable with startbranch=1, first=1, last=7, tail=8 +and would end up looking like: + + 1: TRIE(8) + [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] + <foo> + <bar> + 7: TAIL(8) + 8: EXACT <baz>(10) + 10: END(0) + + d = uvchr_to_utf8_flags(d, uv, 0); + +is the recommended Unicode-aware way of saying + + *(d++) = uv; +*/ + +#define TRIE_STORE_REVCHAR(val) \ + STMT_START { \ + if (UTF) { \ + SV *zlopp = newSV(UTF8_MAXBYTES); \ + unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ + unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ + *kapow = '\0'; \ + SvCUR_set(zlopp, kapow - flrbbbbb); \ + SvPOK_on(zlopp); \ + SvUTF8_on(zlopp); \ + av_push_simple(revcharmap, zlopp); \ + } else { \ + char ooooff = (char)val; \ + av_push_simple(revcharmap, newSVpvn(&ooooff, 1)); \ + } \ + } STMT_END + +/* This gets the next character from the input, folding it if not already + * folded. */ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need \ + * folding */ \ + uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* This folder implies Unicode rules, which in the range expressible \ + * by not UTF is the lower case, with the two exceptions, one of \ + * which should have been taken care of before calling this */ \ + assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ + uvc = toLOWER_L1(*uc); \ + if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ + len = 1; \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ +} STMT_END + + + +#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ + if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ + U32 ging = TRIE_LIST_LEN( state ) * 2; \ + Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ + TRIE_LIST_LEN( state ) = ging; \ + } \ + TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ + TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ + TRIE_LIST_CUR( state )++; \ +} STMT_END + +#define TRIE_LIST_NEW(state) STMT_START { \ + Newx( trie->states[ state ].trans.list, \ + 4, reg_trie_trans_le ); \ + TRIE_LIST_CUR( state ) = 1; \ + TRIE_LIST_LEN( state ) = 4; \ +} STMT_END + +#define TRIE_HANDLE_WORD(state) STMT_START { \ + U16 dupe= trie->states[ state ].wordnum; \ + regnode * const noper_next = regnext( noper ); \ + \ + DEBUG_r({ \ + /* store the word for dumping */ \ + SV* tmp; \ + if (OP(noper) != NOTHING) \ + tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ + else \ + tmp = newSVpvn_utf8( "", 0, UTF ); \ + av_push_simple( trie_words, tmp ); \ + }); \ + \ + curword++; \ + trie->wordinfo[curword].prev = 0; \ + trie->wordinfo[curword].len = wordlen; \ + trie->wordinfo[curword].accept = state; \ + \ + if ( noper_next < tail ) { \ + if (!trie->jump) \ + trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ + sizeof(U16) ); \ + trie->jump[curword] = (U16)(noper_next - convert); \ + if (!jumper) \ + jumper = noper_next; \ + if (!nextbranch) \ + nextbranch= regnext(cur); \ + } \ + \ + if ( dupe ) { \ + /* It's a dupe. Pre-insert into the wordinfo[].prev */\ + /* chain, so that when the bits of chain are later */\ + /* linked together, the dups appear in the chain */\ + trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \ + trie->wordinfo[dupe].prev = curword; \ + } else { \ + /* we haven't inserted this word yet. */ \ + trie->states[ state ].wordnum = curword; \ + } \ +} STMT_END + + +#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ + ( ( base + charid >= ucharcount \ + && base + charid < ubound \ + && state == trie->trans[ base - ucharcount + charid ].check \ + && trie->trans[ base - ucharcount + charid ].next ) \ + ? trie->trans[ base - ucharcount + charid ].next \ + : ( state==1 ? special : 0 ) \ + ) + +#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \ +STMT_START { \ + TRIE_BITMAP_SET(trie, uvc); \ + /* store the folded codepoint */ \ + if ( folder ) \ + TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \ + \ + if ( !UTF ) { \ + /* store first byte of utf8 representation of */ \ + /* variant codepoints */ \ + if (! UVCHR_IS_INVARIANT(uvc)) { \ + TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \ + } \ + } \ +} STMT_END + +I32 +Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, + regnode *first, regnode *last, regnode *tail, + U32 word_count, U32 flags, U32 depth) +{ + /* first pass, loop through and scan words */ + reg_trie_data *trie; + HV *widecharmap = NULL; + AV *revcharmap = newAV(); + regnode *cur; + STRLEN len = 0; + UV uvc = 0; + U16 curword = 0; + U32 next_alloc = 0; + regnode *jumper = NULL; + regnode *nextbranch = NULL; + regnode *convert = NULL; + U32 *prev_states; /* temp array mapping each state to previous one */ + /* we just use folder as a flag in utf8 */ + const U8 * folder = NULL; + + /* in the below reg_add_data call we are storing either 'tu' or 'tuaa' + * which stands for one trie structure, one hash, optionally followed + * by two arrays */ +#ifdef DEBUGGING + const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("tuaa")); + AV *trie_words = NULL; + /* along with revcharmap, this only used during construction but both are + * useful during debugging so we store them in the struct when debugging. + */ +#else + const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("tu")); + STRLEN trie_charcount=0; +#endif + SV *re_trie_maxbuff; + DECLARE_AND_GET_RE_DEBUG_FLAGS; + + PERL_ARGS_ASSERT_MAKE_TRIE; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + switch (flags) { + case EXACT: case EXACT_REQ8: case EXACTL: break; + case EXACTFAA: + case EXACTFUP: + case EXACTFU: + case EXACTFLU8: folder = PL_fold_latin1; break; + case EXACTF: folder = PL_fold; break; + default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, REGNODE_NAME(flags) ); + } + + trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); + trie->refcount = 1; + trie->startstate = 1; + trie->wordcount = word_count; + RExC_rxi->data->data[ data_slot ] = (void*)trie; + trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); + if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL) + trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); + trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( + trie->wordcount+1, sizeof(reg_trie_wordinfo)); + + DEBUG_r({ + trie_words = newAV(); + }); + + re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD); + assert(re_trie_maxbuff); + if (!SvIOK(re_trie_maxbuff)) { + sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); + } + DEBUG_TRIE_COMPILE_r({ + Perl_re_indentf( aTHX_ + "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", + depth+1, + REG_NODE_NUM(startbranch), REG_NODE_NUM(first), + REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); + }); + + /* Find the node we are going to overwrite */ + if ( first == startbranch && OP( last ) != BRANCH ) { + /* whole branch chain */ + convert = first; + } else { + /* branch sub-chain */ + convert = REGNODE_AFTER( first ); + } + + /* -- First loop and Setup -- + + We first traverse the branches and scan each word to determine if it + contains widechars, and how many unique chars there are, this is + important as we have to build a table with at least as many columns as we + have unique chars. + + We use an array of integers to represent the character codes 0..255 + (trie->charmap) and we use a an HV* to store Unicode characters. We use + the native representation of the character value as the key and IV's for + the coded index. + + *TODO* If we keep track of how many times each character is used we can + remap the columns so that the table compression later on is more + efficient in terms of memory by ensuring the most common value is in the + middle and the least common are on the outside. IMO this would be better + than a most to least common mapping as theres a decent chance the most + common letter will share a node with the least common, meaning the node + will not be compressible. With a middle is most common approach the worst + case is when we have the least common nodes twice. + + */ + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + regnode *noper = REGNODE_AFTER( cur ); + const U8 *uc; + const U8 *e; + int foldlen = 0; + U32 wordlen = 0; /* required init */ + STRLEN minchars = 0; + STRLEN maxchars = 0; + bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the + bitmap?*/ + + if (OP(noper) == NOTHING) { + /* skip past a NOTHING at the start of an alternation + * eg, /(?:)a|(?:b)/ should be the same as /a|b/ + * + * If the next node is not something we are supposed to process + * we will just ignore it due to the condition guarding the + * next block. + */ + + regnode *noper_next= regnext(noper); + if (noper_next < tail) + noper= noper_next; + } + + if ( noper < tail + && ( OP(noper) == flags + || (flags == EXACT && OP(noper) == EXACT_REQ8) + || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 + || OP(noper) == EXACTFUP)))) + { + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } else { + trie->minlen= 0; + continue; + } + + + if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ + TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte + regardless of encoding */ + if (OP( noper ) == EXACTFUP) { + /* false positives are ok, so just set this */ + TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); + } + } + + for ( ; uc < e ; uc += len ) { /* Look at each char in the current + branch */ + TRIE_CHARCOUNT(trie)++; + TRIE_READ_CHAR; + + /* TRIE_READ_CHAR returns the current character, or its fold if /i + * is in effect. Under /i, this character can match itself, or + * anything that folds to it. If not under /i, it can match just + * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN + * all fold to k, and all are single characters. But some folds + * expand to more than one character, so for example LATIN SMALL + * LIGATURE FFI folds to the three character sequence 'ffi'. If + * the string beginning at 'uc' is 'ffi', it could be matched by + * three characters, or just by the one ligature character. (It + * could also be matched by two characters: LATIN SMALL LIGATURE FF + * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). + * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also + * match.) The trie needs to know the minimum and maximum number + * of characters that could match so that it can use size alone to + * quickly reject many match attempts. The max is simple: it is + * the number of folded characters in this branch (since a fold is + * never shorter than what folds to it. */ + + maxchars++; + + /* And the min is equal to the max if not under /i (indicated by + * 'folder' being NULL), or there are no multi-character folds. If + * there is a multi-character fold, the min is incremented just + * once, for the character that folds to the sequence. Each + * character in the sequence needs to be added to the list below of + * characters in the trie, but we count only the first towards the + * min number of characters needed. This is done through the + * variable 'foldlen', which is returned by the macros that look + * for these sequences as the number of bytes the sequence + * occupies. Each time through the loop, we decrement 'foldlen' by + * how many bytes the current char occupies. Only when it reaches + * 0 do we increment 'minchars' or look for another multi-character + * sequence. */ + if (folder == NULL) { + minchars++; + } + else if (foldlen > 0) { + foldlen -= (UTF) ? UTF8SKIP(uc) : 1; + } + else { + minchars++; + + /* See if *uc is the beginning of a multi-character fold. If + * so, we decrement the length remaining to look at, to account + * for the current character this iteration. (We can use 'uc' + * instead of the fold returned by TRIE_READ_CHAR because the + * macro is smart enough to account for any unfolded + * characters. */ + if (UTF) { + if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { + foldlen -= UTF8SKIP(uc); + } + } + else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { + foldlen--; + } + } + + /* The current character (and any potential folds) should be added + * to the possible matching characters for this position in this + * branch */ + if ( uvc < 256 ) { + if ( folder ) { + U8 folded= folder[ (U8) uvc ]; + if ( !trie->charmap[ folded ] ) { + trie->charmap[ folded ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( folded ); + } + } + if ( !trie->charmap[ uvc ] ) { + trie->charmap[ uvc ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( uvc ); + } + if ( set_bit ) { + /* store the codepoint in the bitmap, and its folded + * equivalent. */ + TRIE_BITMAP_SET_FOLDED(trie, uvc, folder); + set_bit = 0; /* We've done our bit :-) */ + } + } else { + + /* XXX We could come up with the list of code points that fold + * to this using PL_utf8_foldclosures, except not for + * multi-char folds, as there may be multiple combinations + * there that could work, which needs to wait until runtime to + * resolve (The comment about LIGATURE FFI above is such an + * example */ + + SV** svpp; + if ( !widecharmap ) + widecharmap = newHV(); + + svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); + + if ( !svpp ) + Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc ); + + if ( !SvTRUE( *svpp ) ) { + sv_setiv( *svpp, ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR(uvc); + } + } + } /* end loop through characters in this branch of the trie */ + + /* We take the min and max for this branch and combine to find the min + * and max for all branches processed so far */ + if( cur == first ) { + trie->minlen = minchars; + trie->maxlen = maxchars; + } else if (minchars < trie->minlen) { + trie->minlen = minchars; + } else if (maxchars > trie->maxlen) { + trie->maxlen = maxchars; + } + } /* end first pass */ + DEBUG_TRIE_COMPILE_r( + Perl_re_indentf( aTHX_ + "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", + depth+1, + ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, + (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, + (int)trie->minlen, (int)trie->maxlen ) + ); + + /* + We now know what we are dealing with in terms of unique chars and + string sizes so we can calculate how much memory a naive + representation using a flat table will take. If it's over a reasonable + limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory + conservative but potentially much slower representation using an array + of lists. + + At the end we convert both representations into the same compressed + form that will be used in regexec.c for matching with. The latter + is a form that cannot be used to construct with but has memory + properties similar to the list form and access properties similar + to the table form making it both suitable for fast searches and + small enough that its feasable to store for the duration of a program. + + See the comment in the code where the compressed table is produced + inplace from the flat tabe representation for an explanation of how + the compression works. + + */ + + + Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); + prev_states[1] = 0; + + if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) + > SvIV(re_trie_maxbuff) ) + { + /* + Second Pass -- Array Of Lists Representation + + Each state will be represented by a list of charid:state records + (reg_trie_trans_le) the first such element holds the CUR and LEN + points of the allocated array. (See defines above). + + We build the initial structure using the lists, and then convert + it into the compressed table form which allows faster lookups + (but cant be modified once converted). + */ + + STRLEN transcount = 1; + + DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n", + depth+1)); + + trie->states = (reg_trie_state *) + PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, + sizeof(reg_trie_state) ); + TRIE_LIST_NEW(1); + next_alloc = 2; + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + + regnode *noper = REGNODE_AFTER( cur ); + U32 state = 1; /* required init */ + U16 charid = 0; /* sanity init */ + U32 wordlen = 0; /* required init */ + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next < tail) + noper= noper_next; + /* we will undo this assignment if noper does not + * point at a trieable type in the else clause of + * the following statement. */ + } + + if ( noper < tail + && ( OP(noper) == flags + || (flags == EXACT && OP(noper) == EXACT_REQ8) + || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 + || OP(noper) == EXACTFUP)))) + { + const U8 *uc= (U8*)STRING(noper); + const U8 *e= uc + STR_LEN(noper); + + for ( ; uc < e ; uc += len ) { + + TRIE_READ_CHAR; + + if ( uvc < 256 ) { + charid = trie->charmap[ uvc ]; + } else { + SV** const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); + if ( !svpp ) { + charid = 0; + } else { + charid=(U16)SvIV( *svpp ); + } + } + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ + if ( charid ) { + + U16 check; + U32 newstate = 0; + + charid--; + if ( !trie->states[ state ].trans.list ) { + TRIE_LIST_NEW( state ); + } + for ( check = 1; + check <= TRIE_LIST_USED( state ); + check++ ) + { + if ( TRIE_LIST_ITEM( state, check ).forid + == charid ) + { + newstate = TRIE_LIST_ITEM( state, check ).newstate; + break; + } + } + if ( ! newstate ) { + newstate = next_alloc++; + prev_states[newstate] = state; + TRIE_LIST_PUSH( state, charid, newstate ); + transcount++; + } + state = newstate; + } else { + Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc ); + } + } + } else { + /* If we end up here it is because we skipped past a NOTHING, but did not end up + * on a trieable type. So we need to reset noper back to point at the first regop + * in the branch before we call TRIE_HANDLE_WORD() + */ + noper= REGNODE_AFTER(cur); + } + TRIE_HANDLE_WORD(state); + + } /* end second pass */ + + /* next alloc is the NEXT state to be allocated */ + trie->statecount = next_alloc; + trie->states = (reg_trie_state *) + PerlMemShared_realloc( trie->states, + next_alloc + * sizeof(reg_trie_state) ); + + /* and now dump it out before we compress it */ + DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, + revcharmap, next_alloc, + depth+1) + ); + + trie->trans = (reg_trie_trans *) + PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); + { + U32 state; + U32 tp = 0; + U32 zp = 0; + + + for( state=1 ; state < next_alloc ; state ++ ) { + U32 base=0; + + /* + DEBUG_TRIE_COMPILE_MORE_r( + Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp) + ); + */ + + if (trie->states[state].trans.list) { + U16 minid=TRIE_LIST_ITEM( state, 1).forid; + U16 maxid=minid; + U16 idx; + + for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { + const U16 forid = TRIE_LIST_ITEM( state, idx).forid; + if ( forid < minid ) { + minid=forid; + } else if ( forid > maxid ) { + maxid=forid; + } + } + if ( transcount < tp + maxid - minid + 1) { + transcount *= 2; + trie->trans = (reg_trie_trans *) + PerlMemShared_realloc( trie->trans, + transcount + * sizeof(reg_trie_trans) ); + Zero( trie->trans + (transcount / 2), + transcount / 2, + reg_trie_trans ); + } + base = trie->uniquecharcount + tp - minid; + if ( maxid == minid ) { + U32 set = 0; + for ( ; zp < tp ; zp++ ) { + if ( ! trie->trans[ zp ].next ) { + base = trie->uniquecharcount + zp - minid; + trie->trans[ zp ].next = TRIE_LIST_ITEM( state, + 1).newstate; + trie->trans[ zp ].check = state; + set = 1; + break; + } + } + if ( !set ) { + trie->trans[ tp ].next = TRIE_LIST_ITEM( state, + 1).newstate; + trie->trans[ tp ].check = state; + tp++; + zp = tp; + } + } else { + for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { + const U32 tid = base + - trie->uniquecharcount + + TRIE_LIST_ITEM( state, idx ).forid; + trie->trans[ tid ].next = TRIE_LIST_ITEM( state, + idx ).newstate; + trie->trans[ tid ].check = state; + } + tp += ( maxid - minid + 1 ); + } + Safefree(trie->states[ state ].trans.list); + } + /* + DEBUG_TRIE_COMPILE_MORE_r( + Perl_re_printf( aTHX_ " base: %d\n",base); + ); + */ + trie->states[ state ].trans.base=base; + } + trie->lasttrans = tp + 1; + } + } else { + /* + Second Pass -- Flat Table Representation. + + we dont use the 0 slot of either trans[] or states[] so we add 1 to + each. We know that we will need Charcount+1 trans at most to store + the data (one row per char at worst case) So we preallocate both + structures assuming worst case. + + We then construct the trie using only the .next slots of the entry + structs. + + We use the .check field of the first entry of the node temporarily + to make compression both faster and easier by keeping track of how + many non zero fields are in the node. + + Since trans are numbered from 1 any 0 pointer in the table is a FAIL + transition. + + There are two terms at use here: state as a TRIE_NODEIDX() which is + a number representing the first entry of the node, and state as a + TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) + and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) + if there are 2 entrys per node. eg: + + A B A B + 1. 2 4 1. 3 7 + 2. 0 3 3. 0 5 + 3. 0 0 5. 0 0 + 4. 0 0 7. 0 0 + + The table is internally in the right hand, idx form. However as we + also have to deal with the states array which is indexed by nodenum + we have to use TRIE_NODENUM() to convert. + + */ + DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n", + depth+1)); + + trie->trans = (reg_trie_trans *) + PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) + * trie->uniquecharcount + 1, + sizeof(reg_trie_trans) ); + trie->states = (reg_trie_state *) + PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, + sizeof(reg_trie_state) ); + next_alloc = trie->uniquecharcount + 1; + + + for ( cur = first ; cur < last ; cur = regnext( cur ) ) { + + regnode *noper = REGNODE_AFTER( cur ); + + U32 state = 1; /* required init */ + + U16 charid = 0; /* sanity init */ + U32 accept_state = 0; /* sanity init */ + + U32 wordlen = 0; /* required init */ + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next < tail) + noper= noper_next; + /* we will undo this assignment if noper does not + * point at a trieable type in the else clause of + * the following statement. */ + } + + if ( noper < tail + && ( OP(noper) == flags + || (flags == EXACT && OP(noper) == EXACT_REQ8) + || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 + || OP(noper) == EXACTFUP)))) + { + const U8 *uc= (U8*)STRING(noper); + const U8 *e= uc + STR_LEN(noper); + + for ( ; uc < e ; uc += len ) { + + TRIE_READ_CHAR; + + if ( uvc < 256 ) { + charid = trie->charmap[ uvc ]; + } else { + SV* const * const svpp = hv_fetch( widecharmap, + (char*)&uvc, + sizeof( UV ), + 0); + charid = svpp ? (U16)SvIV(*svpp) : 0; + } + if ( charid ) { + charid--; + if ( !trie->trans[ state + charid ].next ) { + trie->trans[ state + charid ].next = next_alloc; + trie->trans[ state ].check++; + prev_states[TRIE_NODENUM(next_alloc)] + = TRIE_NODENUM(state); + next_alloc += trie->uniquecharcount; + } + state = trie->trans[ state + charid ].next; + } else { + Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc ); + } + /* charid is now 0 if we dont know the char read, or + * nonzero if we do */ + } + } else { + /* If we end up here it is because we skipped past a NOTHING, but did not end up + * on a trieable type. So we need to reset noper back to point at the first regop + * in the branch before we call TRIE_HANDLE_WORD(). + */ + noper= REGNODE_AFTER(cur); + } + accept_state = TRIE_NODENUM( state ); + TRIE_HANDLE_WORD(accept_state); + + } /* end second pass */ + + /* and now dump it out before we compress it */ + DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, + revcharmap, + next_alloc, depth+1)); + + { + /* + * Inplace compress the table.* + + For sparse data sets the table constructed by the trie algorithm will + be mostly 0/FAIL transitions or to put it another way mostly empty. + (Note that leaf nodes will not contain any transitions.) + + This algorithm compresses the tables by eliminating most such + transitions, at the cost of a modest bit of extra work during lookup: + + - Each states[] entry contains a .base field which indicates the + index in the state[] array wheres its transition data is stored. + + - If .base is 0 there are no valid transitions from that node. + + - If .base is nonzero then charid is added to it to find an entry in + the trans array. + + -If trans[states[state].base+charid].check!=state then the + transition is taken to be a 0/Fail transition. Thus if there are fail + transitions at the front of the node then the .base offset will point + somewhere inside the previous nodes data (or maybe even into a node + even earlier), but the .check field determines if the transition is + valid. + + XXX - wrong maybe? + The following process inplace converts the table to the compressed + table: We first do not compress the root node 1,and mark all its + .check pointers as 1 and set its .base pointer as 1 as well. This + allows us to do a DFA construction from the compressed table later, + and ensures that any .base pointers we calculate later are greater + than 0. + + - We set 'pos' to indicate the first entry of the second node. + + - We then iterate over the columns of the node, finding the first and + last used entry at l and m. We then copy l..m into pos..(pos+m-l), + and set the .check pointers accordingly, and advance pos + appropriately and repreat for the next node. Note that when we copy + the next pointers we have to convert them from the original + NODEIDX form to NODENUM form as the former is not valid post + compression. + + - If a node has no transitions used we mark its base as 0 and do not + advance the pos pointer. + + - If a node only has one transition we use a second pointer into the + structure to fill in allocated fail transitions from other states. + This pointer is independent of the main pointer and scans forward + looking for null transitions that are allocated to a state. When it + finds one it writes the single transition into the "hole". If the + pointer doesnt find one the single transition is appended as normal. + + - Once compressed we can Renew/realloc the structures to release the + excess space. + + See "Table-Compression Methods" in sec 3.9 of the Red Dragon, + specifically Fig 3.47 and the associated pseudocode. + + demq + */ + const U32 laststate = TRIE_NODENUM( next_alloc ); + U32 state, charid; + U32 pos = 0, zp=0; + trie->statecount = laststate; + + for ( state = 1 ; state < laststate ; state++ ) { + U8 flag = 0; + const U32 stateidx = TRIE_NODEIDX( state ); + const U32 o_used = trie->trans[ stateidx ].check; + U32 used = trie->trans[ stateidx ].check; + trie->trans[ stateidx ].check = 0; + + for ( charid = 0; + used && charid < trie->uniquecharcount; + charid++ ) + { + if ( flag || trie->trans[ stateidx + charid ].next ) { + if ( trie->trans[ stateidx + charid ].next ) { + if (o_used == 1) { + for ( ; zp < pos ; zp++ ) { + if ( ! trie->trans[ zp ].next ) { + break; + } + } + trie->states[ state ].trans.base + = zp + + trie->uniquecharcount + - charid ; + trie->trans[ zp ].next + = SAFE_TRIE_NODENUM( trie->trans[ stateidx + + charid ].next ); + trie->trans[ zp ].check = state; + if ( ++zp > pos ) pos = zp; + break; + } + used--; + } + if ( !flag ) { + flag = 1; + trie->states[ state ].trans.base + = pos + trie->uniquecharcount - charid ; + } + trie->trans[ pos ].next + = SAFE_TRIE_NODENUM( + trie->trans[ stateidx + charid ].next ); + trie->trans[ pos ].check = state; + pos++; + } + } + } + trie->lasttrans = pos + 1; + trie->states = (reg_trie_state *) + PerlMemShared_realloc( trie->states, laststate + * sizeof(reg_trie_state) ); + DEBUG_TRIE_COMPILE_MORE_r( + Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n", + depth+1, + (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + + 1 ), + (IV)next_alloc, + (IV)pos, + ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); + ); + + } /* end table compress */ + } + DEBUG_TRIE_COMPILE_MORE_r( + Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n", + depth+1, + (UV)trie->statecount, + (UV)trie->lasttrans) + ); + /* resize the trans array to remove unused space */ + trie->trans = (reg_trie_trans *) + PerlMemShared_realloc( trie->trans, trie->lasttrans + * sizeof(reg_trie_trans) ); + + { /* Modify the program and insert the new TRIE node */ + U8 nodetype =(U8) flags; + char *str=NULL; + +#ifdef DEBUGGING + regnode *optimize = NULL; +#endif /* DEBUGGING */ + /* + This means we convert either the first branch or the first Exact, + depending on whether the thing following (in 'last') is a branch + or not and whther first is the startbranch (ie is it a sub part of + the alternation or is it the whole thing.) + Assuming its a sub part we convert the EXACT otherwise we convert + the whole branch sequence, including the first. + */ + /* Find the node we are going to overwrite */ + if ( first != startbranch || OP( last ) == BRANCH ) { + /* branch sub-chain */ + NEXT_OFF( first ) = (U16)(last - first); + /* whole branch chain */ + } + /* But first we check to see if there is a common prefix we can + split out as an EXACT and put in front of the TRIE node. */ + trie->startstate= 1; + if ( trie->bitmap && !widecharmap && !trie->jump ) { + /* we want to find the first state that has more than + * one transition, if that state is not the first state + * then we have a common prefix which we can remove. + */ + U32 state; + for ( state = 1 ; state < trie->statecount-1 ; state++ ) { + U32 ofs = 0; + I32 first_ofs = -1; /* keeps track of the ofs of the first + transition, -1 means none */ + U32 count = 0; + const U32 base = trie->states[ state ].trans.base; + + /* does this state terminate an alternation? */ + if ( trie->states[state].wordnum ) + count = 1; + + for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { + if ( ( base + ofs >= trie->uniquecharcount ) && + ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && + trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) + { + if ( ++count > 1 ) { + /* we have more than one transition */ + SV **tmp; + U8 *ch; + /* if this is the first state there is no common prefix + * to extract, so we can exit */ + if ( state == 1 ) break; + tmp = av_fetch_simple( revcharmap, ofs, 0); + ch = (U8*)SvPV_nolen_const( *tmp ); + + /* if we are on count 2 then we need to initialize the + * bitmap, and store the previous char if there was one + * in it*/ + if ( count == 2 ) { + /* clear the bitmap */ + Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); + DEBUG_OPTIMISE_r( + Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [", + depth+1, + (UV)state)); + if (first_ofs >= 0) { + SV ** const tmp = av_fetch_simple( revcharmap, first_ofs, 0); + const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); + + TRIE_BITMAP_SET_FOLDED(trie,*ch, folder); + DEBUG_OPTIMISE_r( + Perl_re_printf( aTHX_ "%s", (char*)ch) + ); + } + } + /* store the current firstchar in the bitmap */ + TRIE_BITMAP_SET_FOLDED(trie,*ch, folder); + DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch)); + } + first_ofs = ofs; + } + } + if ( count == 1 ) { + /* This state has only one transition, its transition is part + * of a common prefix - we need to concatenate the char it + * represents to what we have so far. */ + SV **tmp = av_fetch_simple( revcharmap, first_ofs, 0); + STRLEN len; + char *ch = SvPV( *tmp, len ); + DEBUG_OPTIMISE_r({ + SV *sv=sv_newmortal(); + Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n", + depth+1, + (UV)state, (UV)first_ofs, + pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, + PL_colors[0], PL_colors[1], + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | + PERL_PV_ESCAPE_FIRSTCHAR + ) + ); + }); + if ( state==1 ) { + OP( convert ) = nodetype; + str=STRING(convert); + setSTR_LEN(convert, 0); + } + assert( ( STR_LEN(convert) + len ) < 256 ); + setSTR_LEN(convert, (U8)(STR_LEN(convert) + len)); + while (len--) + *str++ = *ch++; + } else { +#ifdef DEBUGGING + if (state>1) + DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n")); +#endif + break; + } + } + trie->prefixlen = (state-1); + if (str) { + regnode *n = REGNODE_AFTER(convert); + assert( n - convert <= U16_MAX ); + NEXT_OFF(convert) = n - convert; + trie->startstate = state; + trie->minlen -= (state - 1); + trie->maxlen -= (state - 1); +#ifdef DEBUGGING + /* At least the UNICOS C compiler choked on this + * being argument to DEBUG_r(), so let's just have + * it right here. */ + if ( +#ifdef PERL_EXT_RE_BUILD + 1 +#else + DEBUG_r_TEST +#endif + ) { + U32 word = trie->wordcount; + while (word--) { + SV ** const tmp = av_fetch_simple( trie_words, word, 0 ); + if (tmp) { + if ( STR_LEN(convert) <= SvCUR(*tmp) ) + sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); + else + sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); + } + } + } +#endif + if (trie->maxlen) { + convert = n; + } else { + NEXT_OFF(convert) = (U16)(tail - convert); + DEBUG_r(optimize= n); + } + } + } + if (!jumper) + jumper = last; + if ( trie->maxlen ) { + NEXT_OFF( convert ) = (U16)(tail - convert); + ARG_SET( convert, data_slot ); + /* Store the offset to the first unabsorbed branch in + jump[0], which is otherwise unused by the jump logic. + We use this when dumping a trie and during optimisation. */ + if (trie->jump) + trie->jump[0] = (U16)(nextbranch - convert); + + /* If the start state is not accepting (meaning there is no empty string/NOTHING) + * and there is a bitmap + * and the first "jump target" node we found leaves enough room + * then convert the TRIE node into a TRIEC node, with the bitmap + * embedded inline in the opcode - this is hypothetically faster. + */ + if ( !trie->states[trie->startstate].wordnum + && trie->bitmap + && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) + { + OP( convert ) = TRIEC; + Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); + PerlMemShared_free(trie->bitmap); + trie->bitmap= NULL; + } else + OP( convert ) = TRIE; + + /* store the type in the flags */ + convert->flags = nodetype; + DEBUG_r({ + optimize = convert + + NODE_STEP_REGNODE + + REGNODE_ARG_LEN( OP( convert ) ); + }); + /* XXX We really should free up the resource in trie now, + as we won't use them - (which resources?) dmq */ + } + /* needed for dumping*/ + DEBUG_r(if (optimize) { + /* + Try to clean up some of the debris left after the + optimisation. + */ + while( optimize < jumper ) { + OP( optimize ) = OPTIMIZED; + optimize++; + } + }); + } /* end node insert */ + + /* Finish populating the prev field of the wordinfo array. Walk back + * from each accept state until we find another accept state, and if + * so, point the first word's .prev field at the second word. If the + * second already has a .prev field set, stop now. This will be the + * case either if we've already processed that word's accept state, + * or that state had multiple words, and the overspill words were + * already linked up earlier. + */ + { + U16 word; + U32 state; + U16 prev; + + for (word=1; word <= trie->wordcount; word++) { + prev = 0; + if (trie->wordinfo[word].prev) + continue; + state = trie->wordinfo[word].accept; + while (state) { + state = prev_states[state]; + if (!state) + break; + prev = trie->states[state].wordnum; + if (prev) + break; + } + trie->wordinfo[word].prev = prev; + } + Safefree(prev_states); + } + + + /* and now dump out the compressed format */ + DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); + + RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; +#ifdef DEBUGGING + RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; + RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; +#else + SvREFCNT_dec_NN(revcharmap); +#endif + return trie->jump + ? MADE_JUMP_TRIE + : trie->startstate>1 + ? MADE_EXACT_TRIE + : MADE_TRIE; +} + +regnode * +Perl_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) +{ +/* The Trie is constructed and compressed now so we can build a fail array if + * it's needed + + This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and + 3.32 in the + "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, + Ullman 1985/88 + ISBN 0-201-10088-6 + + We find the fail state for each state in the trie, this state is the longest + proper suffix of the current state's 'word' that is also a proper prefix of + another word in our trie. State 1 represents the word '' and is thus the + default fail state. This allows the DFA not to have to restart after its + tried and failed a word at a given point, it simply continues as though it + had been matching the other word in the first place. + Consider + 'abcdgu'=~/abcdefg|cdgu/ + When we get to 'd' we are still matching the first word, we would encounter + 'g' which would fail, which would bring us to the state representing 'd' in + the second word where we would try 'g' and succeed, proceeding to match + 'cdgu'. + */ + /* add a fail transition */ + const U32 trie_offset = ARG(source); + reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; + U32 *q; + const U32 ucharcount = trie->uniquecharcount; + const U32 numstates = trie->statecount; + const U32 ubound = trie->lasttrans + ucharcount; + U32 q_read = 0; + U32 q_write = 0; + U32 charid; + U32 base = trie->states[ 1 ].trans.base; + U32 *fail; + reg_ac_data *aho; + const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("T")); + regnode *stclass; + DECLARE_AND_GET_RE_DEBUG_FLAGS; + + PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; + PERL_UNUSED_CONTEXT; +#ifndef DEBUGGING + PERL_UNUSED_ARG(depth); +#endif + + if ( OP(source) == TRIE ) { + struct regnode_1 *op = (struct regnode_1 *) + PerlMemShared_calloc(1, sizeof(struct regnode_1)); + StructCopy(source, op, struct regnode_1); + stclass = (regnode *)op; + } else { + struct regnode_charclass *op = (struct regnode_charclass *) + PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); + StructCopy(source, op, struct regnode_charclass); + stclass = (regnode *)op; + } + OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */ + + ARG_SET( stclass, data_slot ); + aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); + RExC_rxi->data->data[ data_slot ] = (void*)aho; + aho->trie=trie_offset; + aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); + Copy( trie->states, aho->states, numstates, reg_trie_state ); + Newx( q, numstates, U32); + aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); + aho->refcount = 1; + fail = aho->fail; + /* initialize fail[0..1] to be 1 so that we always have + a valid final fail state */ + fail[ 0 ] = fail[ 1 ] = 1; + + for ( charid = 0; charid < ucharcount ; charid++ ) { + const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); + if ( newstate ) { + q[ q_write ] = newstate; + /* set to point at the root */ + fail[ q[ q_write++ ] ]=1; + } + } + while ( q_read < q_write) { + const U32 cur = q[ q_read++ % numstates ]; + base = trie->states[ cur ].trans.base; + + for ( charid = 0 ; charid < ucharcount ; charid++ ) { + const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); + if (ch_state) { + U32 fail_state = cur; + U32 fail_base; + do { + fail_state = fail[ fail_state ]; + fail_base = aho->states[ fail_state ].trans.base; + } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); + + fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); + fail[ ch_state ] = fail_state; + if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) + { + aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; + } + q[ q_write++ % numstates] = ch_state; + } + } + } + /* restore fail[0..1] to 0 so that we "fall out" of the AC loop + when we fail in state 1, this allows us to use the + charclass scan to find a valid start char. This is based on the principle + that theres a good chance the string being searched contains lots of stuff + that cant be a start char. + */ + fail[ 0 ] = fail[ 1 ] = 0; + DEBUG_TRIE_COMPILE_r({ + Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0", + depth, (UV)numstates + ); + for( q_read=1; q_read<numstates; q_read++ ) { + Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]); + } + Perl_re_printf( aTHX_ "\n"); + }); + Safefree(q); + /*RExC_seen |= REG_TRIEDFA_SEEN;*/ + return stclass; +} diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl index 00c4983e80..cb17a110dc 100644 --- a/regen/unicode_constants.pl +++ b/regen/unicode_constants.pl @@ -869,7 +869,7 @@ foreach my $charset (get_supported_code_pages()) { $max_PRINT_A = sprintf "0x%02X", $max_PRINT_A; print $out_fh <<"EOT"; -# ifdef PERL_IN_REGCOMP_C +# ifdef PERL_IN_REGCOMP_ANY # define MAX_PRINT_A $max_PRINT_A /* The max code point that isPRINT_A */ # endif EOT @@ -931,7 +931,7 @@ $count = 0x110000 - $count; print $out_fh <<~"EOT"; /* The number of code points not matching \\pC */ - #ifdef PERL_IN_REGCOMP_C + #ifdef PERL_IN_REGCOMP_ANY # define NON_OTHER_COUNT $count #endif EOT @@ -2,7 +2,7 @@ */ /* - * One Ring to rule them all, One Ring to find them + * One Ring to rule them all, One Ring to find them * * [p.v of _The Lord of the Rings_, opening poem] * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"] @@ -40,22 +40,22 @@ /* * pregcomp and pregexec -- regsub and regerror are not used in perl * - * Copyright (c) 1986 by University of Toronto. - * Written by Henry Spencer. Not derived from licensed software. + * Copyright (c) 1986 by University of Toronto. + * Written by Henry Spencer. Not derived from licensed software. * - * Permission is granted to anyone to use this software for any - * purpose on any computer system, and to redistribute it freely, - * subject to the following restrictions: + * Permission is granted to anyone to use this software for any + * purpose on any computer system, and to redistribute it freely, + * subject to the following restrictions: * - * 1. The author is not responsible for the consequences of use of - * this software, no matter how awful, even if they arise - * from defects in it. + * 1. The author is not responsible for the consequences of use of + * this software, no matter how awful, even if they arise + * from defects in it. * - * 2. The origin of this software must not be misrepresented, either - * by explicit claim or by omission. + * 2. The origin of this software must not be misrepresented, either + * by explicit claim or by omission. * - * 3. Altered versions must be plainly marked as such, and must not - * be misrepresented as being the original software. + * 3. Altered versions must be plainly marked as such, and must not + * be misrepresented as being the original software. * **** Alterations to Henry's code are... **** @@ -71,6 +71,7 @@ * regular-expression syntax might require a total rethink. */ #include "EXTERN.h" +#define PERL_IN_REGEX_ENGINE #define PERL_IN_REGEXEC_C #include "perl.h" @@ -119,7 +120,7 @@ static const char non_utf8_target_but_utf8_required[] } STMT_END #ifndef STATIC -#define STATIC static +#define STATIC static #endif /* @@ -138,8 +139,8 @@ static const char non_utf8_target_but_utf8_required[] #define HOPBACK3(pos, off, lim) \ (reginfo->is_utf8_target \ ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \ - : (pos - off >= lim) \ - ? (U8*)pos - off \ + : (pos - off >= lim) \ + ? (U8*)pos - off \ : NULL) #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg)) @@ -167,7 +168,7 @@ static const char non_utf8_target_but_utf8_required[] : (U8*)(pos + off)) #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim)) -#define PLACEHOLDER /* Something for the preprocessor to grab onto */ +#define PLACEHOLDER /* Something for the preprocessor to grab onto */ /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ /* for use after a quantifier and before an EXACT-like node -- japhy */ @@ -268,7 +269,7 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH) PL_savestack_ix += paren_elems_to_push; DEBUG_BUFFERS_r({ - I32 p; + I32 p; for (p = parenfloor + 1; p <= (I32)maxopenparen; p++) { Perl_re_exec_indentf(aTHX_ " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n", @@ -441,7 +442,7 @@ S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH) PL_savestack_ix = tmpix; } -#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ +#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ STATIC bool S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) @@ -889,7 +890,7 @@ Perl_re_intuit_start(pTHX_ U8 other_ix = 1 - prog->substrs->check_ix; bool ml_anch = 0; char *other_last = strpos;/* latest pos 'other' substr already checked to */ - char *check_at = NULL; /* check substr found at this pos */ + char *check_at = NULL; /* check substr found at this pos */ const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; RXi_GET_DECL(prog,progi); regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ @@ -1067,7 +1068,7 @@ Perl_re_intuit_start(pTHX_ end_shift = prog->check_end_shift; -#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ if (end_shift < 0) Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ", (IV)end_shift, RX_PRECOMP(rx)); @@ -1306,7 +1307,7 @@ Perl_re_intuit_start(pTHX_ } s = HOP3c(rx_origin, other->min_offset, strend); - if (s < other_last) /* These positions already checked */ + if (s < other_last) /* These positions already checked */ s = other_last; must = utf8_target ? other->utf8_substr : other->substr; @@ -1665,7 +1666,7 @@ Perl_re_intuit_start(pTHX_ cannot start at strpos. */ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n")); - ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ } else { /* The found rx_origin position does not prohibit matching at @@ -1674,11 +1675,11 @@ Perl_re_intuit_start(pTHX_ * zero, free it. */ if (!(prog->intflags & PREGf_NAUGHTY) && (utf8_target ? ( - prog->check_utf8 /* Could be deleted already */ + prog->check_utf8 /* Could be deleted already */ && --BmUSEFUL(prog->check_utf8) < 0 && (prog->check_utf8 == prog->float_utf8) ) : ( - prog->check_substr /* Could be deleted already */ + prog->check_substr /* Could be deleted already */ && --BmUSEFUL(prog->check_substr) < 0 && (prog->check_substr == prog->float_substr) ))) @@ -1688,9 +1689,9 @@ Perl_re_intuit_start(pTHX_ /* XXX Does the destruction order has to change with utf8_target? */ SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); - prog->check_substr = prog->check_utf8 = NULL; /* disable */ - prog->float_substr = prog->float_utf8 = NULL; /* clear */ - check = NULL; /* abort */ + prog->check_substr = prog->check_utf8 = NULL; /* disable */ + prog->float_substr = prog->float_utf8 = NULL; /* clear */ + check = NULL; /* abort */ /* XXXX This is a remnant of the old implementation. It looks wasteful, since now INTUIT can use many other heuristics. */ @@ -1704,8 +1705,8 @@ Perl_re_intuit_start(pTHX_ return rx_origin; - fail_finish: /* Substring not found */ - if (prog->check_substr || prog->check_utf8) /* could be removed already */ + fail_finish: /* Substring not found */ + if (prog->check_substr || prog->check_utf8) /* could be removed already */ BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ fail: DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n", @@ -2202,8 +2203,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; char *pat_string; /* The pattern's exactish string */ - char *pat_end; /* ptr to end char of pat_string */ - re_fold_t folder; /* Function for computing non-utf8 folds */ + char *pat_end; /* ptr to end char of pat_string */ + re_fold_t folder; /* Function for computing non-utf8 folds */ const U8 *fold_array; /* array for folding ords < 256 */ STRLEN ln; STRLEN lnc; @@ -2473,7 +2474,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * Unicode semantics and the german sharp ss, which hence should not be * compiled into a node that gets here. */ pat_string = STRINGs(c); - ln = STR_LENs(c); /* length to match in octets/bytes */ + ln = STR_LENs(c); /* length to match in octets/bytes */ /* We know that we have to match at least 'ln' bytes (which is the same * as characters, since not utf8). If we have to match 3 characters, @@ -2598,7 +2599,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * can have the same fold, or portion of a fold, or different- * length fold */ pat_string = STRINGs(c); - ln = STR_LENs(c); /* length to match in octets/bytes */ + ln = STR_LENs(c); /* length to match in octets/bytes */ pat_end = pat_string + ln; lnc = is_utf8_pat /* length to match in characters */ ? utf8_length((U8 *) pat_string, (U8 *) pat_end) @@ -3600,8 +3601,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, char *s; regnode *c; char *startpos; - SSize_t minlen; /* must match at least this many chars */ - SSize_t dontbother = 0; /* how many characters not to try at end */ + SSize_t minlen; /* must match at least this many chars */ + SSize_t dontbother = 0; /* how many characters not to try at end */ const bool utf8_target = cBOOL(DO_UTF8(sv)); I32 multiline; RXi_GET_DECL(prog,progi); @@ -3755,7 +3756,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, RXp_MATCH_TAINTED_off(prog); RXp_MATCH_UTF8_set(prog, utf8_target); - reginfo->prog = rx; /* Yes, sorry that this is confusing. */ + reginfo->prog = rx; /* Yes, sorry that this is confusing. */ reginfo->intuit = 0; reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); reginfo->warned = FALSE; @@ -3964,7 +3965,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, SSize_t back_max; SSize_t back_min; char *last; - char *last1; /* Last position checked before */ + char *last1; /* Last position checked before */ #ifdef DEBUGGING int did_match = 0; #endif @@ -4006,14 +4007,14 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, if (back_min<0) { last = strend; } else { - last = HOP3c(strend, /* Cannot start after this */ + last = HOP3c(strend, /* Cannot start after this */ -(SSize_t)(CHR_SVLEN(must) - (SvTAIL(must) != 0) + back_min), strbeg); } if (s > reginfo->strbeg) last1 = HOPc(s, -1); else - last1 = s - 1; /* bogus */ + last1 = s - 1; /* bogus */ /* XXXX check_substr already used to find "s", can optimize if check_substr==must. */ @@ -4169,7 +4170,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, if (len) last = rninstr(s, strend, little, little + len); else - last = strend; /* matching "$" */ + last = strend; /* matching "$" */ } if (!last) { /* at one point this block contained a comment which was @@ -4188,7 +4189,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } if (minlen && (dontbother < minlen)) dontbother = minlen - 1; - strend -= dontbother; /* this one's always in bytes! */ + strend -= dontbother; /* this one's always in bytes! */ /* We don't know much -- general case. */ if (utf8_target) { for (;;) { @@ -4275,16 +4276,16 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, * Do inc before dec, in case old and new rex are the same */ #define SET_reg_curpm(Re2) \ if (reginfo->info_aux_eval) { \ - (void)ReREFCNT_inc(Re2); \ - ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ - PM_SETRE((PL_reg_curpm), (Re2)); \ + (void)ReREFCNT_inc(Re2); \ + ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ + PM_SETRE((PL_reg_curpm), (Re2)); \ } /* - regtry - try match at specific point */ -STATIC bool /* 0 failure, 1 success */ +STATIC bool /* 0 failure, 1 success */ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) { CHECKPOINT lastcp; @@ -6366,7 +6367,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* cache heavy used fields of st in registers */ regnode *scan; regnode *next; - U32 n = 0; /* general value; init to avoid compiler warning */ + U32 n = 0; /* general value; init to avoid compiler warning */ U32 utmp = 0; /* tmp variable - valid for at most one opcode */ SSize_t ln = 0; /* len or last; init to avoid compiler warning */ SSize_t endref = 0; /* offset of end of backref when ln is start */ @@ -6378,7 +6379,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) PERL_INT_FAST16_T nextbyte; /* is always set to UCHARAT(locinput), or -1 at EOS */ - bool result = 0; /* return value of S_regmatch */ + bool result = 0; /* return value of S_regmatch */ U32 depth = 0; /* depth of backtrack stack */ U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */ const U32 max_nochange_depth = @@ -6406,9 +6407,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * the very next op. They have a useful lifetime of exactly one loop * iteration, and are not preserved or restored by state pushes/pops */ - bool sw = 0; /* the condition value in (?(cond)a|b) */ - bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */ - int logical = 0; /* the following EVAL is: + bool sw = 0; /* the condition value in (?(cond)a|b) */ + bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */ + int logical = 0; /* the following EVAL is: 0: (?{...}) 1: (?(?{...})X|Y) 2: (??{...}) @@ -6419,8 +6420,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) PAD* last_pad = NULL; dMULTICALL; U8 gimme = G_SCALAR; - CV *caller_cv = NULL; /* who called us */ - CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ + CV *caller_cv = NULL; /* who called us */ + CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ U32 maxopenparen = 0; /* max '(' index seen so far */ int to_complement; /* Invert the result? */ char_class_number_ classnum; @@ -7790,7 +7791,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* Match either CR LF or '.', as all the other possibilities * require utf8 */ - locinput++; /* Match the . or CR */ + locinput++; /* Match the . or CR */ if (nextbyte == '\r' /* And if it was CR, and the next is LF, match the LF */ && locinput < loceol @@ -7924,12 +7925,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) endref = rex->offs[n].end; reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ if (rex->lastparen < n || ln == -1 || endref == -1) - sayNO; /* Do not match unless seen CLOSEn. */ + sayNO; /* Do not match unless seen CLOSEn. */ if (ln == endref) break; s = reginfo->strbeg + ln; - if (type != REF /* REF can do byte comparison */ + if (type != REF /* REF can do byte comparison */ && (utf8_target || type == REFFU || type == REFFL)) { char * limit = loceol; @@ -8202,7 +8203,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * the savestack frame */ before = (IV)(SP-PL_stack_base); PL_op = nop; - CALLRUNOPS(aTHX); /* Scalar context. */ + CALLRUNOPS(aTHX); /* Scalar context. */ SPAGAIN; if ((IV)(SP-PL_stack_base) == before) ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */ @@ -8674,7 +8675,7 @@ NULL ST.B = next; ST.minmod = minmod; minmod = 0; - ST.count = -1; /* this will be updated by WHILEM */ + ST.count = -1; /* this will be updated by WHILEM */ ST.lastloc = NULL; /* this will be updated by WHILEM */ PUSH_YES_STATE_GOTO(CURLYX_end, REGNODE_BEFORE(next), locinput, loceol, @@ -8916,13 +8917,13 @@ NULL #undef ST #define ST st->u.branch - case BRANCHJ: /* /(...|A|...)/ with long next pointer */ + case BRANCHJ: /* /(...|A|...)/ with long next pointer */ next = scan + ARG(scan); if (next == scan) next = NULL; /* FALLTHROUGH */ - case BRANCH: /* /(...|A|...)/ */ + case BRANCH: /* /(...|A|...)/ */ scan = REGNODE_AFTER_opcode(scan,state_num); /* scan now points to inner node */ assert(scan); ST.lastparen = rex->lastparen; @@ -8988,7 +8989,7 @@ NULL #undef ST #define ST st->u.curlym - case CURLYM: /* /A{m,n}B/ where A is fixed-length */ + case CURLYM: /* /A{m,n}B/ where A is fixed-length */ /* This is an optimisation of CURLYX that enables us to push * only a single backtracking state, no matter how many matches @@ -9172,22 +9173,22 @@ NULL } \ } - case STAR: /* /A*B/ where A is width 1 char */ + case STAR: /* /A*B/ where A is width 1 char */ ST.paren = 0; ST.min = 0; ST.max = REG_INFTY; scan = REGNODE_AFTER_type(scan,tregnode_STAR); goto repeat; - case PLUS: /* /A+B/ where A is width 1 char */ + case PLUS: /* /A+B/ where A is width 1 char */ ST.paren = 0; ST.min = 1; ST.max = REG_INFTY; scan = REGNODE_AFTER_type(scan,tregnode_PLUS); goto repeat; - case CURLYN: /* /(A){m,n}B/ where A is width 1 char */ - ST.paren = scan->flags; /* Which paren to set */ + case CURLYN: /* /(A){m,n}B/ where A is width 1 char */ + ST.paren = scan->flags; /* Which paren to set */ ST.lastparen = rex->lastparen; ST.lastcloseparen = rex->lastcloseparen; if (ST.paren > maxopenparen) @@ -9208,7 +9209,7 @@ NULL goto repeat; - case CURLY: /* /A{m,n}B/ where A is width 1 char */ + case CURLY: /* /A{m,n}B/ where A is width 1 char */ ST.paren = 0; ST.min = ARG1(scan); /* min to match */ ST.max = ARG2(scan); /* max to match */ @@ -9471,7 +9472,7 @@ NULL /* we've just finished A in /(??{A})B/; now continue with B */ is_accepted= false; SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput); - st->u.eval.prev_rex = rex_sv; /* inner */ + st->u.eval.prev_rex = rex_sv; /* inner */ /* Save *all* the positions. */ st->u.eval.cp = regcppush(rex, 0, maxopenparen); @@ -9513,9 +9514,9 @@ NULL (long)(reginfo->till - startpos), PL_colors[5])); - sayNO_SILENT; /* Cannot match: too short. */ + sayNO_SILENT; /* Cannot match: too short. */ } - sayYES; /* Success! */ + sayYES; /* Success! */ case LOOKBEHIND_END: /* validate that *lookbehind* UNLESSM/IFMATCH matches end at the right spot, required for @@ -9536,23 +9537,23 @@ NULL Perl_re_exec_indentf( aTHX_ "%sSUCCEED: subpattern success...%s\n", depth, PL_colors[4], PL_colors[5])); - sayYES; /* Success! */ + sayYES; /* Success! */ #undef ST #define ST st->u.ifmatch - case SUSPEND: /* (?>A) */ + case SUSPEND: /* (?>A) */ ST.wanted = 1; ST.start = locinput; ST.end = loceol; ST.count = 1; goto do_ifmatch; - case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */ + case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */ ST.wanted = 0; goto ifmatch_trivial_fail_test; - case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */ + case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */ ST.wanted = 1; ifmatch_trivial_fail_test: ST.prev_match_end= match_end; @@ -11739,9 +11740,430 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) return retval; } - #endif /* ifndef PERL_IN_XSUB_RE */ +/* Buffer logic. */ +SV* +Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF; + + PERL_UNUSED_ARG(value); + + if (flags & RXapif_FETCH) { + return reg_named_buff_fetch(rx, key, flags); + } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { + Perl_croak_no_modify(); + return NULL; + } else if (flags & RXapif_EXISTS) { + return reg_named_buff_exists(rx, key, flags) + ? &PL_sv_yes + : &PL_sv_no; + } else if (flags & RXapif_REGNAMES) { + return reg_named_buff_all(rx, flags); + } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { + return reg_named_buff_scalar(rx, flags); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, + const U32 flags) +{ + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER; + PERL_UNUSED_ARG(lastkey); + + if (flags & RXapif_FIRSTKEY) + return reg_named_buff_firstkey(rx, flags); + else if (flags & RXapif_NEXTKEY) + return reg_named_buff_nextkey(rx, flags); + else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", + (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, + const U32 flags) +{ + SV *ret; + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; + + if (rx && RXp_PAREN_NAMES(rx)) { + HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 ); + if (he_str) { + IV i; + SV* sv_dat=HeVAL(he_str); + I32 *nums=(I32*)SvPVX(sv_dat); + AV * const retarray = (flags & RXapif_ALL) ? newAV_alloc_x(SvIVX(sv_dat)) : NULL; + for ( i=0; i<SvIVX(sv_dat); i++ ) { + if ((I32)(rx->nparens) >= nums[i] + && rx->offs[nums[i]].start != -1 + && rx->offs[nums[i]].end != -1) + { + ret = newSVpvs(""); + CALLREG_NUMBUF_FETCH(r, nums[i], ret); + if (!retarray) + return ret; + } else { + if (retarray) + ret = newSV_type(SVt_NULL); + } + if (retarray) + av_push_simple(retarray, ret); + } + if (retarray) + return newRV_noinc(MUTABLE_SV(retarray)); + } + } + return NULL; +} + +bool +Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, + const U32 flags) +{ + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & RXapif_ALL) { + return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0); + } else { + SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags); + if (sv) { + SvREFCNT_dec_NN(sv); + return TRUE; + } else { + return FALSE; + } + } + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; + + if ( rx && RXp_PAREN_NAMES(rx) ) { + (void)hv_iterinit(RXp_PAREN_NAMES(rx)); + + return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY); + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + DECLARE_AND_GET_RE_DEBUG_FLAGS; + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv = RXp_PAREN_NAMES(rx); + HE *temphe; + while ( (temphe = hv_iternext_flags(hv, 0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + return newSVhek(HeKEY_hek(temphe)); + } + } + } + return NULL; +} + +SV* +Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) +{ + SV *ret; + AV *av; + SSize_t length; + struct regexp *const rx = ReANY(r); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; + + if (rx && RXp_PAREN_NAMES(rx)) { + if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { + return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx))); + } else if (flags & RXapif_ONE) { + ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES)); + av = MUTABLE_AV(SvRV(ret)); + length = av_count(av); + SvREFCNT_dec_NN(ret); + return newSViv(length); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", + (int)flags); + return NULL; + } + } + return &PL_sv_undef; +} + +SV* +Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) +{ + struct regexp *const rx = ReANY(r); + AV *av = newAV(); + + PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; + + if (rx && RXp_PAREN_NAMES(rx)) { + HV *hv= RXp_PAREN_NAMES(rx); + HE *temphe; + (void)hv_iterinit(hv); + while ( (temphe = hv_iternext_flags(hv, 0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + av_push_simple(av, newSVhek(HeKEY_hek(temphe))); + } + } + } + + return newRV_noinc(MUTABLE_SV(av)); +} + +void +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, + SV * const sv) +{ + struct regexp *const rx = ReANY(r); + char *s = NULL; + SSize_t i = 0; + SSize_t s1, t1; + I32 n = paren; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; + + if ( n == RX_BUFF_IDX_CARET_PREMATCH + || n == RX_BUFF_IDX_CARET_FULLMATCH + || n == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto ret_undef; + } + + if (!rx->subbeg) + goto ret_undef; + + if (n == RX_BUFF_IDX_CARET_FULLMATCH) + /* no need to distinguish between them any more */ + n = RX_BUFF_IDX_FULLMATCH; + + if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) + && rx->offs[0].start != -1) + { + /* $`, ${^PREMATCH} */ + i = rx->offs[0].start; + s = rx->subbeg; + } + else + if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) + && rx->offs[0].end != -1) + { + /* $', ${^POSTMATCH} */ + s = rx->subbeg - rx->suboffset + rx->offs[0].end; + i = rx->sublen + rx->suboffset - rx->offs[0].end; + } + else + if (inRANGE(n, 0, (I32)rx->nparens) && + (s1 = rx->offs[n].start) != -1 && + (t1 = rx->offs[n].end) != -1) + { + /* $&, ${^MATCH}, $1 ... */ + i = t1 - s1; + s = rx->subbeg + s1 - rx->suboffset; + } else { + goto ret_undef; + } + + assert(s >= rx->subbeg); + assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) ); + if (i >= 0) { +#ifdef NO_TAINT_SUPPORT + sv_setpvn(sv, s, i); +#else + const int oldtainted = TAINT_get; + TAINT_NOT; + sv_setpvn(sv, s, i); + TAINT_set(oldtainted); +#endif + if (RXp_MATCH_UTF8(rx)) + SvUTF8_on(sv); + else + SvUTF8_off(sv); + if (TAINTING_get) { + if (RXp_MATCH_TAINTED(rx)) { + if (SvTYPE(sv) >= SVt_PVMG) { + MAGIC* const mg = SvMAGIC(sv); + MAGIC* mgt; + TAINT; + SvMAGIC_set(sv, mg->mg_moremagic); + SvTAINT(sv); + if ((mgt = SvMAGIC(sv))) { + mg->mg_moremagic = mgt; + SvMAGIC_set(sv, mg); + } + } else { + TAINT; + SvTAINT(sv); + } + } else + SvTAINTED_off(sv); + } + } else { + ret_undef: + sv_set_undef(sv); + return; + } +} + +void +Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, + SV const * const value) +{ + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE; + + PERL_UNUSED_ARG(rx); + PERL_UNUSED_ARG(paren); + PERL_UNUSED_ARG(value); + + if (!PL_localizing) + Perl_croak_no_modify(); +} + +I32 +Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, + const I32 paren) +{ + struct regexp *const rx = ReANY(r); + I32 i; + I32 s1, t1; + + PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; + + if ( paren == RX_BUFF_IDX_CARET_PREMATCH + || paren == RX_BUFF_IDX_CARET_FULLMATCH + || paren == RX_BUFF_IDX_CARET_POSTMATCH + ) + { + bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY); + if (!keepcopy) { + /* on something like + * $r = qr/.../; + * /$qr/p; + * the KEEPCOPY is set on the PMOP rather than the regex */ + if (PL_curpm && r == PM_GETRE(PL_curpm)) + keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY); + } + if (!keepcopy) + goto warn_undef; + } + + /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */ + switch (paren) { + case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ + case RX_BUFF_IDX_PREMATCH: /* $` */ + if (rx->offs[0].start != -1) { + i = rx->offs[0].start; + if (i > 0) { + s1 = 0; + t1 = i; + goto getlen; + } + } + return 0; + + case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ + case RX_BUFF_IDX_POSTMATCH: /* $' */ + if (rx->offs[0].end != -1) { + i = rx->sublen - rx->offs[0].end; + if (i > 0) { + s1 = rx->offs[0].end; + t1 = rx->sublen; + goto getlen; + } + } + return 0; + + default: /* $& / ${^MATCH}, $1, $2, ... */ + if (paren <= (I32)rx->nparens && + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) + { + i = t1 - s1; + goto getlen; + } else { + warn_undef: + if (ckWARN(WARN_UNINITIALIZED)) + report_uninit((const SV *)sv); + return 0; + } + } + getlen: + if (i > 0 && RXp_MATCH_UTF8(rx)) { + const char * const s = rx->subbeg - rx->suboffset + s1; + const U8 *ep; + STRLEN el; + + i = t1 - s1; + if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) + i = el; + } + return i; +} + /* * ex: set ts=8 sts=4 sw=4 et: */ @@ -30,7 +30,7 @@ struct regnode_meta { }; struct regnode { - U8 flags; + U8 flags; U8 type; U16 next_off; }; @@ -47,13 +47,13 @@ struct regexp; struct reg_substr_datum { SSize_t min_offset; /* min pos (in chars) that substr must appear */ SSize_t max_offset; /* max pos (in chars) that substr must appear */ - SV *substr; /* non-utf8 variant */ - SV *utf8_substr; /* utf8 variant */ + SV *substr; /* non-utf8 variant */ + SV *utf8_substr; /* utf8 variant */ SSize_t end_shift; /* how many fixed chars must end the string */ }; struct reg_substr_data { U8 check_ix; /* index into data[] of check substr */ - struct reg_substr_datum data[3]; /* Actual array */ + struct reg_substr_datum data[3]; /* Actual array */ }; # ifdef PERL_ANY_COW @@ -70,13 +70,13 @@ typedef struct regexp_paren_pair { /* 'start_tmp' records a new opening position before the matching end * has been found, so that the old start and end values are still * valid, e.g. - * "abc" =~ /(.(?{print "[$1]"}))+/ + * "abc" =~ /(.(?{print "[$1]"}))+/ *outputs [][a][b] * This field is not part of the API. */ SSize_t start_tmp; } regexp_paren_pair; -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_UTF8_C) +# if defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_UTF8_C) # define _invlist_union(a, b, output) _invlist_union_maybe_complement_2nd(a, b, FALSE, output) # define _invlist_intersection(a, b, output) _invlist_intersection_maybe_complement_2nd(a, b, FALSE, output) @@ -174,13 +174,13 @@ typedef struct regexp { } regexp; -# define RXp_PAREN_NAMES(rx) ((rx)->paren_names) +# define RXp_PAREN_NAMES(rx) ((rx)->paren_names) /* used for high speed searches */ typedef struct re_scream_pos_data_s { - char **scream_olds; /* match pos */ - SSize_t *scream_pos; /* Internal iterator of scream. */ + char **scream_olds; /* match pos */ + SSize_t *scream_pos; /* Internal iterator of scream. */ } re_scream_pos_data; /* regexp_engine structure. This is the dispatch table for regexes. @@ -302,7 +302,7 @@ and check for NULL. # include "op_reg_common.h" -# define RXf_PMf_STD_PMMOD (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE|RXf_PMf_NOCAPTURE) +# define RXf_PMf_STD_PMMOD (RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_FOLD|RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE|RXf_PMf_NOCAPTURE) # define CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl, x_count) \ case IGNORE_PAT_MOD: *(pmfl) |= RXf_PMf_FOLD; break; \ @@ -372,7 +372,7 @@ and check for NULL. # define INT_PAT_MODS STD_PAT_MODS KEEPCOPY_PAT_MODS # define EXT_PAT_MODS ONCE_PAT_MODS KEEPCOPY_PAT_MODS NOCAPTURE_PAT_MODS -# define QR_PAT_MODS STD_PAT_MODS EXT_PAT_MODS CHARSET_PAT_MODS +# define QR_PAT_MODS STD_PAT_MODS EXT_PAT_MODS CHARSET_PAT_MODS # define M_PAT_MODS QR_PAT_MODS LOOP_PAT_MODS # define S_PAT_MODS M_PAT_MODS EXEC_PAT_MODS NONDESTRUCT_PAT_MODS @@ -425,38 +425,38 @@ and check for NULL. /* What we have seen */ # define RXf_NO_INPLACE_SUBST (1U<<(RXf_BASE_SHIFT+2)) -# define RXf_EVAL_SEEN (1U<<(RXf_BASE_SHIFT+3)) +# define RXf_EVAL_SEEN (1U<<(RXf_BASE_SHIFT+3)) /* Special */ # define RXf_UNBOUNDED_QUANTIFIER_SEEN (1U<<(RXf_BASE_SHIFT+4)) -# define RXf_CHECK_ALL (1U<<(RXf_BASE_SHIFT+5)) +# define RXf_CHECK_ALL (1U<<(RXf_BASE_SHIFT+5)) /* UTF8 related */ -# define RXf_MATCH_UTF8 (1U<<(RXf_BASE_SHIFT+6)) /* $1 etc are utf8 */ +# define RXf_MATCH_UTF8 (1U<<(RXf_BASE_SHIFT+6)) /* $1 etc are utf8 */ /* Intuit related */ -# define RXf_USE_INTUIT_NOML (1U<<(RXf_BASE_SHIFT+7)) -# define RXf_USE_INTUIT_ML (1U<<(RXf_BASE_SHIFT+8)) -# define RXf_INTUIT_TAIL (1U<<(RXf_BASE_SHIFT+9)) +# define RXf_USE_INTUIT_NOML (1U<<(RXf_BASE_SHIFT+7)) +# define RXf_USE_INTUIT_ML (1U<<(RXf_BASE_SHIFT+8)) +# define RXf_INTUIT_TAIL (1U<<(RXf_BASE_SHIFT+9)) # define RXf_USE_INTUIT (RXf_USE_INTUIT_NOML|RXf_USE_INTUIT_ML) /* Do we have some sort of anchor? */ # define RXf_IS_ANCHORED (1U<<(RXf_BASE_SHIFT+10)) /* Copy and tainted info */ -# define RXf_COPY_DONE (1U<<(RXf_BASE_SHIFT+11)) +# define RXf_COPY_DONE (1U<<(RXf_BASE_SHIFT+11)) /* post-execution: $1 et al are tainted */ -# define RXf_TAINTED_SEEN (1U<<(RXf_BASE_SHIFT+12)) +# define RXf_TAINTED_SEEN (1U<<(RXf_BASE_SHIFT+12)) /* this pattern was tainted during compilation */ -# define RXf_TAINTED (1U<<(RXf_BASE_SHIFT+13)) +# define RXf_TAINTED (1U<<(RXf_BASE_SHIFT+13)) /* Flags indicating special patterns */ # define RXf_START_ONLY (1U<<(RXf_BASE_SHIFT+14)) /* Pattern is /^/ */ # define RXf_SKIPWHITE (1U<<(RXf_BASE_SHIFT+15)) /* Pattern is for a */ /* split " " */ -# define RXf_WHITE (1U<<(RXf_BASE_SHIFT+16)) /* Pattern is /\s+/ */ -# define RXf_NULL (1U<<(RXf_BASE_SHIFT+17)) /* Pattern is // */ +# define RXf_WHITE (1U<<(RXf_BASE_SHIFT+16)) /* Pattern is /\s+/ */ +# define RXf_NULL (1U<<(RXf_BASE_SHIFT+17)) /* Pattern is // */ /* See comments at the beginning of these defines about adding bits. The * highest bit position should be used, so that if RXf_BASE_SHIFT gets @@ -619,35 +619,35 @@ and check for NULL. on second iteration */ #if defined(PERL_USE_GCC_BRACE_GROUPS) -# define ReREFCNT_inc(re) \ - ({ \ - /* This is here to generate a casting warning if incorrect. */ \ - REGEXP *const _rerefcnt_inc = (re); \ - assert(SvTYPE(_rerefcnt_inc) == SVt_REGEXP); \ - SvREFCNT_inc(_rerefcnt_inc); \ - _rerefcnt_inc; \ +# define ReREFCNT_inc(re) \ + ({ \ + /* This is here to generate a casting warning if incorrect. */ \ + REGEXP *const _rerefcnt_inc = (re); \ + assert(SvTYPE(_rerefcnt_inc) == SVt_REGEXP); \ + SvREFCNT_inc(_rerefcnt_inc); \ + _rerefcnt_inc; \ }) -# define ReREFCNT_dec(re) \ - ({ \ - /* This is here to generate a casting warning if incorrect. */ \ - REGEXP *const _rerefcnt_dec = (re); \ - SvREFCNT_dec(_rerefcnt_dec); \ +# define ReREFCNT_dec(re) \ + ({ \ + /* This is here to generate a casting warning if incorrect. */ \ + REGEXP *const _rerefcnt_dec = (re); \ + SvREFCNT_dec(_rerefcnt_dec); \ }) #else -# define ReREFCNT_dec(re) SvREFCNT_dec(re) -# define ReREFCNT_inc(re) ((REGEXP *) SvREFCNT_inc(re)) +# define ReREFCNT_dec(re) SvREFCNT_dec(re) +# define ReREFCNT_inc(re) ((REGEXP *) SvREFCNT_inc(re)) #endif -#define ReANY(re) Perl_ReANY((const REGEXP *)(re)) +#define ReANY(re) Perl_ReANY((const REGEXP *)(re)) /* FIXME for plugins. */ -#define FBMcf_TAIL_DOLLAR 1 -#define FBMcf_TAIL_DOLLARM 2 -#define FBMcf_TAIL_Z 4 -#define FBMcf_TAIL_z 8 -#define FBMcf_TAIL (FBMcf_TAIL_DOLLAR|FBMcf_TAIL_DOLLARM|FBMcf_TAIL_Z|FBMcf_TAIL_z) +#define FBMcf_TAIL_DOLLAR 1 +#define FBMcf_TAIL_DOLLARM 2 +#define FBMcf_TAIL_Z 4 +#define FBMcf_TAIL_z 8 +#define FBMcf_TAIL (FBMcf_TAIL_DOLLAR|FBMcf_TAIL_DOLLARM|FBMcf_TAIL_Z|FBMcf_TAIL_z) -#define FBMrf_MULTILINE 1 +#define FBMrf_MULTILINE 1 struct regmatch_state; struct regmatch_slab; @@ -681,7 +681,7 @@ typedef struct { regmatch_info_aux_eval *info_aux_eval; struct regmatch_state *old_regmatch_state; /* saved PL_regmatch_state */ struct regmatch_slab *old_regmatch_slab; /* saved PL_regmatch_slab */ - char *poscache; /* S-L cache of fail positions of WHILEMs */ + char *poscache; /* S-L cache of fail positions of WHILEMs */ } regmatch_info_aux; @@ -753,8 +753,8 @@ struct next_matchable_info { typedef I32 CHECKPOINT; typedef struct regmatch_state { - int resume_state; /* where to jump to on return */ - char *locinput; /* where to backtrack in string on failure */ + int resume_state; /* where to jump to on return */ + char *locinput; /* where to backtrack in string on failure */ char *loceol; U8 *sr0; /* position of start of script run, or NULL */ @@ -819,14 +819,14 @@ typedef struct regmatch_state { U32 lastcloseparen; CHECKPOINT cp; - U32 accepted; /* how many accepting states left */ - bool longfold;/* saw a fold with a 1->n char mapping */ + U32 accepted; /* how many accepting states left */ + bool longfold;/* saw a fold with a 1->n char mapping */ U16 *jump; /* positive offsets from me */ - regnode *me; /* Which node am I - needed for jump tries*/ - U8 *firstpos;/* pos in string of first trie match */ - U32 firstchars;/* len in chars of firstpos from start */ - U16 nextword;/* next word to try */ - U16 topword; /* longest accepted word */ + regnode *me; /* Which node am I - needed for jump tries*/ + U8 *firstpos;/* pos in string of first trie match */ + U32 firstchars;/* len in chars of firstpos from start */ + U16 nextword;/* next word to try */ + U16 topword; /* longest accepted word */ } trie; /* special types - these members are used to store state for special @@ -836,11 +836,11 @@ typedef struct regmatch_state { struct regmatch_state *prev_yes_state; struct regmatch_state *prev_curlyx; struct regmatch_state *prev_eval; - REGEXP *prev_rex; - CHECKPOINT cp; /* remember current savestack indexes */ - CHECKPOINT lastcp; + REGEXP *prev_rex; + CHECKPOINT cp; /* remember current savestack indexes */ + CHECKPOINT lastcp; U32 close_paren; /* which close bracket is our end (+1) */ - regnode *B; /* the node following us */ + regnode *B; /* the node following us */ char *prev_recurse_locinput; } eval; @@ -848,7 +848,7 @@ typedef struct regmatch_state { /* this first element must match u.yes */ struct regmatch_state *prev_yes_state; I32 wanted; - I32 logical; /* saved copy of 'logical' var */ + I32 logical; /* saved copy of 'logical' var */ U8 count; /* number of beginning positions */ char *start; char *end; @@ -874,26 +874,26 @@ typedef struct regmatch_state { /* this first element must match u.yes */ struct regmatch_state *prev_yes_state; struct regmatch_state *prev_curlyx; /* previous cur_curlyx */ - regnode *me; /* the CURLYX node */ - regnode *B; /* the B node in /A*B/ */ - CHECKPOINT cp; /* remember current savestack index */ - bool minmod; - int parenfloor;/* how far back to strip paren data */ + regnode *me; /* the CURLYX node */ + regnode *B; /* the B node in /A*B/ */ + CHECKPOINT cp; /* remember current savestack index */ + bool minmod; + int parenfloor;/* how far back to strip paren data */ /* these two are modified by WHILEM */ - int count; /* how many instances of A we've matched */ - char *lastloc;/* where previous A matched (0-len detect) */ + int count; /* how many instances of A we've matched */ + char *lastloc;/* where previous A matched (0-len detect) */ } curlyx; struct { /* this first element must match u.yes */ struct regmatch_state *prev_yes_state; struct regmatch_state *save_curlyx; - CHECKPOINT cp; /* remember current savestack indexes */ - CHECKPOINT lastcp; - char *save_lastloc; /* previous curlyx.lastloc */ - I32 cache_offset; - I32 cache_mask; + CHECKPOINT cp; /* remember current savestack indexes */ + CHECKPOINT lastcp; + char *save_lastloc; /* previous curlyx.lastloc */ + I32 cache_offset; + I32 cache_mask; } whilem; struct { @@ -902,11 +902,11 @@ typedef struct regmatch_state { CHECKPOINT cp; U32 lastparen; U32 lastcloseparen; - I32 alen; /* length of first-matched A string */ + I32 alen; /* length of first-matched A string */ I32 count; bool minmod; - regnode *A, *B; /* the nodes corresponding to /A*B/ */ - regnode *me; /* the curlym node */ + regnode *A, *B; /* the nodes corresponding to /A*B/ */ + regnode *me; /* the curlym node */ struct next_matchable_info Binfo; } curlym; @@ -915,11 +915,11 @@ typedef struct regmatch_state { CHECKPOINT cp; U32 lastparen; U32 lastcloseparen; - char *maxpos; /* highest possible point in string to match */ - char *oldloc; /* the previous locinput */ + char *maxpos; /* highest possible point in string to match */ + char *oldloc; /* the previous locinput */ int count; - int min, max; /* {m,n} */ - regnode *A, *B; /* the nodes corresponding to /A*B/ */ + int min, max; /* {m,n} */ + regnode *A, *B; /* the nodes corresponding to /A*B/ */ struct next_matchable_info Binfo; } curly; /* and CURLYN/PLUS/STAR */ diff --git a/unicode_constants.h b/unicode_constants.h index 1b295e0dda..0f176f279a 100644 --- a/unicode_constants.h +++ b/unicode_constants.h @@ -103,7 +103,7 @@ bytes. # define DEPRECATED_OPENING_NON_UTF8_BRACKETS "\xAB\xBB" # endif -# ifdef PERL_IN_REGCOMP_C +# ifdef PERL_IN_REGCOMP_ANY # define MAX_PRINT_A 0x7E /* The max code point that isPRINT_A */ # endif #endif /* ASCII/Latin1 */ @@ -164,7 +164,7 @@ bytes. # define DEPRECATED_OPENING_NON_UTF8_BRACKETS "\x8A\x8B" # endif -# ifdef PERL_IN_REGCOMP_C +# ifdef PERL_IN_REGCOMP_ANY # define MAX_PRINT_A 0xF9 /* The max code point that isPRINT_A */ # endif #endif /* EBCDIC 1047 */ @@ -225,13 +225,13 @@ bytes. # define DEPRECATED_OPENING_NON_UTF8_BRACKETS "\x8A\x8B" # endif -# ifdef PERL_IN_REGCOMP_C +# ifdef PERL_IN_REGCOMP_ANY # define MAX_PRINT_A 0xF9 /* The max code point that isPRINT_A */ # endif #endif /* EBCDIC 037 */ /* The number of code points not matching \pC */ -#ifdef PERL_IN_REGCOMP_C +#ifdef PERL_IN_REGCOMP_ANY # define NON_OTHER_COUNT 149016 #endif diff --git a/vms/descrip_mms.template b/vms/descrip_mms.template index ba0d6ab6d3..e8e1c685f6 100644 --- a/vms/descrip_mms.template +++ b/vms/descrip_mms.template @@ -211,25 +211,27 @@ FULLLIBS2 = $(LIBS2)|$(THRLIBS1)|$(THRLIBS2) c0 = $(MALLOC_C) av.c builtin.c caretx.c deb.c doio.c doop.c dquote.c dump.c globals.c gv.c hv.c mro_core.c c1 = mg.c locale.c mathoms.c miniperlmain.c numeric.c op.c pad.c peep.c perl.c perlio.c -c2 = perly.c pp.c pp_ctl.c pp_hot.c pp_pack.c pp_sort.c pp_sys.c regcomp.c regexec.c reentr.c -c3 = run.c scope.c sv.c taint.c time64.c toke.c universal.c utf8.c util.c vms.c keywords.c -c = $(c0) $(c1) $(c2) $(c3) +c2 = perly.c pp.c pp_ctl.c pp_hot.c pp_pack.c pp_sort.c pp_sys.c regcomp.c regcomp_debug.c +c3 = regcomp_invlist.c regcomp_study.c regcomp_trie.c regexec.c reentr.c +c4 = run.c scope.c sv.c taint.c time64.c toke.c universal.c utf8.c util.c vms.c keywords.c +c = $(c0) $(c1) $(c2) $(c3) $(c4) obj0 = perl$(O) obj1 = $(MALLOC_O) av$(O) builtin$(O) caretx$(O) deb$(O) doio$(O) doop$(O) dquote$(O) dump$(O) mro_core$(O) globals$(O) gv$(O) hv$(O) obj2 = keywords$(O) locale$(O) mathoms$(O) mg$(O) miniperlmain$(O) numeric$(O) op$(O) pad$(O) peep$(O) perlio$(O) obj3 = perly$(O) pp$(O) pp_ctl$(O) pp_hot$(O) reentr$(O) pp_pack$(O) pp_sort$(O) pp_sys$(O) regcomp$(O) -obj4 = regexec$(O) run$(O) scope$(O) sv$(O) taint$(O) time64$(O) toke$(O) universal$(O) utf8$(O) util$(O) vms$(O) +obj4 = regcomp_debug$(O) regcomp_invlist$(O) regcomp_study$(O) regcomp_trie$(O) +obj5 = regexec$(O) run$(O) scope$(O) sv$(O) taint$(O) time64$(O) toke$(O) universal$(O) utf8$(O) util$(O) vms$(O) -mini_obj = perlmini$(O) $(obj1) $(obj2) $(obj3) $(obj4) -obj = $(obj0) $(obj1) $(obj2) $(obj3) $(obj4) +mini_obj = perlmini$(O) $(obj1) $(obj2) $(obj3) $(obj4) $(obj5) +obj = $(obj0) $(obj1) $(obj2) $(obj3) $(obj4) $(obj5) h0 = av.h config.h cop.h cv.h embed.h embedvar.h h1 = EXTERN.h form.h gv.h handy.h hv.h l1_char_class_tab.h INTERN.h intrpvar.h h2 = iperlsys.h keywords.h mydtrace.h mg.h mg_vtable.h nostdio.h op.h h3 = op_reg_common.h opcode.h opnames.h overload.h pad.h parser.h patchlevel.h h4 = perl.h perlapi.h perlio.h perlsdio.h perlvars.h perly.h -h5 = pp.h pp_proto.h proto.h regcomp.h regexp.h regnodes.h scope.h +h5 = pp.h pp_proto.h proto.h regcomp.h regcomp_internal.h regexp.h regnodes.h scope.h h6 = sv.h thread.h utf8.h util.h vmsish.h warnings.h xsub.h h = $(h0) $(h1) $(h2) $(h3) $(h4) $(h5) $(h6) @@ -672,6 +674,14 @@ reentr$(O) : reentr.c $(h) $(CC) $(CORECFLAGS) $(MMS$SOURCE) regcomp$(O) : regcomp.c $(h) $(CC) $(CORECFLAGS) $(MMS$SOURCE) +regcomp_debug$(O) : regcomp_debug.c $(h) + $(CC) $(CORECFLAGS) $(MMS$SOURCE) +regcomp_invlist$(O) : regcomp_invlist.c $(h) + $(CC) $(CORECFLAGS) $(MMS$SOURCE) +regcomp_study$(O) : regcomp_study.c $(h) + $(CC) $(CORECFLAGS) $(MMS$SOURCE) +regcomp_trie$(O) : regcomp_trie.c $(h) + $(CC) $(CORECFLAGS) $(MMS$SOURCE) regexec$(O) : regexec.c $(h) $(CC) $(CORECFLAGS) $(MMS$SOURCE) run$(O) : run.c $(h) diff --git a/win32/GNUmakefile b/win32/GNUmakefile index 6dd59e069e..7ba37332f3 100644 --- a/win32/GNUmakefile +++ b/win32/GNUmakefile @@ -941,6 +941,10 @@ NOOP = @rem MICROCORE_SRC = \ ..\toke.c \ ..\regcomp.c \ + ..\regcomp_trie.c \ + ..\regcomp_debug.c \ + ..\regcomp_invlist.c \ + ..\regcomp_study.c \ ..\regexec.c \ ..\op.c \ ..\sv.c \ @@ -1017,6 +1021,7 @@ CORE_NOCFG_H = \ ..\pp.h \ ..\proto.h \ ..\regcomp.h \ + ..\regcomp_internal.h \ ..\regexp.h \ ..\scope.h \ ..\sv.h \ @@ -1154,7 +1159,15 @@ ifeq ($(CCTYPE),) endif -..\regcomp$(o) : ..\regnodes.h ..\regcharclass.h +..\regcomp$(o) : ..\regcomp.h ..\regcomp_internal.h ..\regnodes.h ..\regcharclass.h + +..\regcomp_debug$(o) : ..\regcomp.h ..\regcomp_internal.h ..\regnodes.h ..\regcharclass.h + +..\regcomp_invlist$(o) : ..\regcomp.h ..\regcomp_internal.h ..\regnodes.h ..\regcharclass.h + +..\regcomp_study$(o) : ..\regcomp.h ..\regcomp_internal.h ..\regnodes.h ..\regcharclass.h + +..\regcomp_trie$(o) : ..\regcomp.h ..\regcomp_internal.h ..\regnodes.h ..\regcharclass.h ..\regexec$(o) : ..\regnodes.h ..\regcharclass.h @@ -1894,4 +1907,3 @@ nok: utils $(PERLEXE) $(PERLDLL) Extensions_nonxs Extensions nokfile: utils $(PERLEXE) $(PERLDLL) Extensions_nonxs Extensions $(PERLEXE) ..\utils\perlbug -nok -s "(UNINSTALLED)" -F perl.nok - diff --git a/win32/Makefile b/win32/Makefile index 4b347bde9f..b403767586 100644 --- a/win32/Makefile +++ b/win32/Makefile @@ -699,6 +699,10 @@ MICROCORE_SRC = \ ..\pp_sys.c \ ..\reentr.c \ ..\regcomp.c \ + ..\regcomp_trie.c \ + ..\regcomp_debug.c \ + ..\regcomp_invlist.c \ + ..\regcomp_study.c \ ..\regexec.c \ ..\run.c \ ..\scope.c \ @@ -747,6 +751,7 @@ CORE_NOCFG_H = \ ..\pp.h \ ..\proto.h \ ..\regcomp.h \ + ..\regcomp_internal.h \ ..\regexp.h \ ..\scope.h \ ..\sv.h \ @@ -848,7 +853,15 @@ all : ..\git_version.h $(GLOBEXE) $(CONFIGPM) \ regnodes : ..\regnodes.h -..\regcomp$(o) : ..\regnodes.h ..\regcharclass.h +..\regcomp$(o) : ..\regcomp.h ..\regcomp_internal.h ..\regnodes.h ..\regcharclass.h + +..\regcomp_debug$(o) : ..\regcomp.h ..\regcomp_internal.h ..\regnodes.h ..\regcharclass.h + +..\regcomp_invlist$(o) : ..\regcomp.h ..\regcomp_internal.h ..\regnodes.h ..\regcharclass.h + +..\regcomp_study$(o) : ..\regcomp.h ..\regcomp_internal.h ..\regnodes.h ..\regcharclass.h + +..\regcomp_trie$(o) : ..\regcomp.h ..\regcomp_internal.h ..\regnodes.h ..\regcharclass.h ..\regexec$(o) : ..\regnodes.h ..\regcharclass.h |