summaryrefslogtreecommitdiff
path: root/regcomp_debug.c
diff options
context:
space:
mode:
authorYves Orton <demerphq@gmail.com>2022-12-09 11:00:17 +0100
committerYves Orton <demerphq@gmail.com>2022-12-09 16:19:29 +0100
commit85900e28cc250e1c4603f11073b77d0c6b5cff46 (patch)
treeacc41c05f436dd1063459753dda9b557f6261e6c /regcomp_debug.c
parent6a6e5d037dad0702bc219f8265505037e1772552 (diff)
downloadperl-85900e28cc250e1c4603f11073b77d0c6b5cff46.tar.gz
regcomp.c - decompose into smaller files
This splits a bunch of the subcomponents of the regex engine into smaller files. regcomp_debug.c regcomp_internal.h regcomp_invlist.c regcomp_study.c regcomp_trie.c The only real change besides to the build machine to achieve the split is to also adds some new defines which can be used in embed.fnc to control exports without having to enumerate /every/ regex engine file. For instance all of regcomp*.c defines PERL_IN_REGCOMP_ANY, and this is used in embed.fnc to manage exports.
Diffstat (limited to 'regcomp_debug.c')
-rw-r--r--regcomp_debug.c1625
1 files changed, 1625 insertions, 0 deletions
diff --git a/regcomp_debug.c b/regcomp_debug.c
new file mode 100644
index 0000000000..c7c42f6941
--- /dev/null
+++ b/regcomp_debug.c
@@ -0,0 +1,1625 @@
+#ifdef PERL_EXT_RE_BUILD
+#include "re_top.h"
+#endif
+
+#include "EXTERN.h"
+#define PERL_IN_REGEX_ENGINE
+#define PERL_IN_REGCOMP_ANY
+#define PERL_IN_REGCOMP_DEBUG_C
+#include "perl.h"
+
+#ifdef PERL_IN_XSUB_RE
+# include "re_comp.h"
+#else
+# include "regcomp.h"
+#endif
+
+#include "invlist_inline.h"
+#include "unicode_constants.h"
+#include "regcomp_internal.h"
+
+#ifdef DEBUGGING
+
+int
+Perl_re_printf(pTHX_ const char *fmt, ...)
+{
+ va_list ap;
+ int result;
+ PerlIO *f= Perl_debug_log;
+ PERL_ARGS_ASSERT_RE_PRINTF;
+ va_start(ap, fmt);
+ result = PerlIO_vprintf(f, fmt, ap);
+ va_end(ap);
+ return result;
+}
+
+int
+Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
+{
+ va_list ap;
+ int result;
+ PerlIO *f= Perl_debug_log;
+ PERL_ARGS_ASSERT_RE_INDENTF;
+ va_start(ap, depth);
+ PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
+ result = PerlIO_vprintf(f, fmt, ap);
+ va_end(ap);
+ return result;
+}
+
+void
+Perl_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
+ const char *close_str)
+{
+ PERL_ARGS_ASSERT_DEBUG_SHOW_STUDY_FLAGS;
+ if (!flags)
+ return;
+
+ Perl_re_printf( aTHX_ "%s", open_str);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
+ Perl_re_printf( aTHX_ "%s", close_str);
+}
+
+void
+Perl_debug_studydata(pTHX_ const char *where, scan_data_t *data,
+ U32 depth, int is_inf,
+ SSize_t min, SSize_t stopmin, SSize_t delta)
+{
+ PERL_ARGS_ASSERT_DEBUG_STUDYDATA;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
+
+ DEBUG_OPTIMISE_MORE_r({
+ if (!data)
+ return;
+ Perl_re_indentf(aTHX_ "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
+ depth,
+ where,
+ min, stopmin, delta,
+ (IV)data->pos_min,
+ (IV)data->pos_delta,
+ (UV)data->flags
+ );
+
+ Perl_debug_show_study_flags(aTHX_ data->flags," [","]");
+
+ Perl_re_printf( aTHX_
+ " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
+ (IV)data->whilem_c,
+ (IV)(data->last_closep ? *((data)->last_closep) : -1),
+ is_inf ? "INF " : ""
+ );
+
+ if (data->last_found) {
+ int i;
+ Perl_re_printf(aTHX_
+ "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
+ SvPVX_const(data->last_found),
+ (IV)data->last_end,
+ (IV)data->last_start_min,
+ (IV)data->last_start_max
+ );
+
+ for (i = 0; i < 2; i++) {
+ Perl_re_printf(aTHX_
+ " %s%s: '%s' @ %" IVdf "/%" IVdf,
+ data->cur_is_floating == i ? "*" : "",
+ i ? "Float" : "Fixed",
+ SvPVX_const(data->substrs[i].str),
+ (IV)data->substrs[i].min_offset,
+ (IV)data->substrs[i].max_offset
+ );
+ Perl_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
+ }
+ }
+
+ Perl_re_printf( aTHX_ "\n");
+ });
+}
+
+
+void
+Perl_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
+ regnode *scan, U32 depth, U32 flags)
+{
+ PERL_ARGS_ASSERT_DEBUG_PEEP;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
+
+ DEBUG_OPTIMISE_r({
+ regnode *Next;
+
+ if (!scan)
+ return;
+ Next = regnext(scan);
+ regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
+ Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
+ depth,
+ str,
+ REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
+ Next ? (REG_NODE_NUM(Next)) : 0 );
+ Perl_debug_show_study_flags(aTHX_ flags," [ ","]");
+ Perl_re_printf( aTHX_ "\n");
+ });
+}
+
+#endif /* DEBUGGING */
+
+/*
+ - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
+ */
+#ifdef DEBUGGING
+
+static void
+S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
+{
+ int bit;
+ int set=0;
+
+ ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
+
+ for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
+ if (flags & (1<<bit)) {
+ if (!set++ && lead)
+ Perl_re_printf( aTHX_ "%s", lead);
+ Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
+ }
+ }
+ if (lead) {
+ if (set)
+ Perl_re_printf( aTHX_ "\n");
+ else
+ Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
+ }
+}
+
+static void
+S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
+{
+ int bit;
+ int set=0;
+ regex_charset cs;
+
+ ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
+
+ for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
+ if (flags & (1U<<bit)) {
+ if ((1U<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
+ continue;
+ }
+ if (!set++ && lead)
+ Perl_re_printf( aTHX_ "%s", lead);
+ Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
+ }
+ }
+ if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
+ if (!set++ && lead) {
+ Perl_re_printf( aTHX_ "%s", lead);
+ }
+ switch (cs) {
+ case REGEX_UNICODE_CHARSET:
+ Perl_re_printf( aTHX_ "UNICODE");
+ break;
+ case REGEX_LOCALE_CHARSET:
+ Perl_re_printf( aTHX_ "LOCALE");
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
+ break;
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+ Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
+ break;
+ default:
+ Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
+ break;
+ }
+ }
+ if (lead) {
+ if (set)
+ Perl_re_printf( aTHX_ "\n");
+ else
+ Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
+ }
+}
+#endif
+
+void
+Perl_regdump(pTHX_ const regexp *r)
+{
+#ifdef DEBUGGING
+ int i;
+ SV * const sv = sv_newmortal();
+ SV *dsv= sv_newmortal();
+ RXi_GET_DECL(r, ri);
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
+
+ PERL_ARGS_ASSERT_REGDUMP;
+
+ (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
+
+ /* Header fields of interest. */
+ for (i = 0; i < 2; i++) {
+ if (r->substrs->data[i].substr) {
+ RE_PV_QUOTED_DECL(s, 0, dsv,
+ SvPVX_const(r->substrs->data[i].substr),
+ RE_SV_DUMPLEN(r->substrs->data[i].substr),
+ PL_dump_re_max_len);
+ Perl_re_printf( aTHX_
+ "%s %s%s at %" IVdf "..%" UVuf " ",
+ i ? "floating" : "anchored",
+ s,
+ RE_SV_TAIL(r->substrs->data[i].substr),
+ (IV)r->substrs->data[i].min_offset,
+ (UV)r->substrs->data[i].max_offset);
+ }
+ else if (r->substrs->data[i].utf8_substr) {
+ RE_PV_QUOTED_DECL(s, 1, dsv,
+ SvPVX_const(r->substrs->data[i].utf8_substr),
+ RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
+ 30);
+ Perl_re_printf( aTHX_
+ "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
+ i ? "floating" : "anchored",
+ s,
+ RE_SV_TAIL(r->substrs->data[i].utf8_substr),
+ (IV)r->substrs->data[i].min_offset,
+ (UV)r->substrs->data[i].max_offset);
+ }
+ }
+
+ if (r->check_substr || r->check_utf8)
+ Perl_re_printf( aTHX_
+ (const char *)
+ ( r->check_substr == r->substrs->data[1].substr
+ && r->check_utf8 == r->substrs->data[1].utf8_substr
+ ? "(checking floating" : "(checking anchored"));
+ if (r->intflags & PREGf_NOSCAN)
+ Perl_re_printf( aTHX_ " noscan");
+ if (r->extflags & RXf_CHECK_ALL)
+ Perl_re_printf( aTHX_ " isall");
+ if (r->check_substr || r->check_utf8)
+ Perl_re_printf( aTHX_ ") ");
+
+ if (ri->regstclass) {
+ regprop(r, sv, ri->regstclass, NULL, NULL);
+ Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
+ }
+ if (r->intflags & PREGf_ANCH) {
+ Perl_re_printf( aTHX_ "anchored");
+ if (r->intflags & PREGf_ANCH_MBOL)
+ Perl_re_printf( aTHX_ "(MBOL)");
+ if (r->intflags & PREGf_ANCH_SBOL)
+ Perl_re_printf( aTHX_ "(SBOL)");
+ if (r->intflags & PREGf_ANCH_GPOS)
+ Perl_re_printf( aTHX_ "(GPOS)");
+ Perl_re_printf( aTHX_ " ");
+ }
+ if (r->intflags & PREGf_GPOS_SEEN)
+ Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
+ if (r->intflags & PREGf_SKIP)
+ Perl_re_printf( aTHX_ "plus ");
+ if (r->intflags & PREGf_IMPLICIT)
+ Perl_re_printf( aTHX_ "implicit ");
+ Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
+ if (r->extflags & RXf_EVAL_SEEN)
+ Perl_re_printf( aTHX_ "with eval ");
+ Perl_re_printf( aTHX_ "\n");
+ DEBUG_FLAGS_r({
+ regdump_extflags("r->extflags: ", r->extflags);
+ regdump_intflags("r->intflags: ", r->intflags);
+ });
+#else
+ PERL_ARGS_ASSERT_REGDUMP;
+ PERL_UNUSED_CONTEXT;
+ PERL_UNUSED_ARG(r);
+#endif /* DEBUGGING */
+}
+
+/* Should be synchronized with ANYOF_ #defines in regcomp.h */
+#ifdef DEBUGGING
+
+# if CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1 || CC_ALPHA_ != 2 \
+ || CC_LOWER_ != 3 || CC_UPPER_ != 4 || CC_PUNCT_ != 5 \
+ || CC_PRINT_ != 6 || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8 \
+ || CC_CASED_ != 9 || CC_SPACE_ != 10 || CC_BLANK_ != 11 \
+ || CC_XDIGIT_ != 12 || CC_CNTRL_ != 13 || CC_ASCII_ != 14 \
+ || CC_VERTSPACE_ != 15
+# error Need to adjust order of anyofs[]
+# endif
+static const char * const anyofs[] = {
+ "\\w",
+ "\\W",
+ "\\d",
+ "\\D",
+ "[:alpha:]",
+ "[:^alpha:]",
+ "[:lower:]",
+ "[:^lower:]",
+ "[:upper:]",
+ "[:^upper:]",
+ "[:punct:]",
+ "[:^punct:]",
+ "[:print:]",
+ "[:^print:]",
+ "[:alnum:]",
+ "[:^alnum:]",
+ "[:graph:]",
+ "[:^graph:]",
+ "[:cased:]",
+ "[:^cased:]",
+ "\\s",
+ "\\S",
+ "[:blank:]",
+ "[:^blank:]",
+ "[:xdigit:]",
+ "[:^xdigit:]",
+ "[:cntrl:]",
+ "[:^cntrl:]",
+ "[:ascii:]",
+ "[:^ascii:]",
+ "\\v",
+ "\\V"
+};
+#endif
+
+/*
+- regprop - printable representation of opcode, with run time support
+*/
+
+void
+Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
+{
+#ifdef DEBUGGING
+ U8 k;
+ const U8 op = OP(o);
+ RXi_GET_DECL(prog, progi);
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
+
+ PERL_ARGS_ASSERT_REGPROP;
+
+ SvPVCLEAR(sv);
+
+ if (op > REGNODE_MAX) { /* regnode.type is unsigned */
+ if (pRExC_state) { /* This gives more info, if we have it */
+ FAIL3("panic: corrupted regexp opcode %d > %d",
+ (int)op, (int)REGNODE_MAX);
+ }
+ else {
+ Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
+ (int)op, (int)REGNODE_MAX);
+ }
+ }
+ sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */
+
+ k = REGNODE_TYPE(op);
+
+ if (k == EXACT) {
+ sv_catpvs(sv, " ");
+ /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
+ * is a crude hack but it may be the best for now since
+ * we have no flag "this EXACTish node was UTF-8"
+ * --jhi */
+ pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
+ PL_colors[0], PL_colors[1],
+ PERL_PV_ESCAPE_UNI_DETECT |
+ PERL_PV_ESCAPE_NONASCII |
+ PERL_PV_PRETTY_ELLIPSES |
+ PERL_PV_PRETTY_LTGT |
+ PERL_PV_PRETTY_NOCLEAR
+ );
+ } else if (k == TRIE) {
+ /* print the details of the trie in dumpuntil instead, as
+ * progi->data isn't available here */
+ const U32 n = ARG(o);
+ const reg_ac_data * const ac = IS_TRIE_AC(op) ?
+ (reg_ac_data *)progi->data->data[n] :
+ NULL;
+ const reg_trie_data * const trie
+ = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
+
+ Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(o->flags));
+ DEBUG_TRIE_COMPILE_r({
+ if (trie->jump)
+ sv_catpvs(sv, "(JUMP)");
+ Perl_sv_catpvf(aTHX_ sv,
+ "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
+ (UV)trie->startstate,
+ (IV)trie->statecount-1, /* -1 because of the unused 0 element */
+ (UV)trie->wordcount,
+ (UV)trie->minlen,
+ (UV)trie->maxlen,
+ (UV)TRIE_CHARCOUNT(trie),
+ (UV)trie->uniquecharcount
+ );
+ });
+ if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
+ sv_catpvs(sv, "[");
+ (void) put_charclass_bitmap_innards(sv,
+ ((IS_ANYOF_TRIE(op))
+ ? ANYOF_BITMAP(o)
+ : TRIE_BITMAP(trie)),
+ NULL,
+ NULL,
+ NULL,
+ 0,
+ FALSE
+ );
+ sv_catpvs(sv, "]");
+ }
+ } else if (k == CURLY) {
+ U32 lo = ARG1(o), hi = ARG2(o);
+ if (op == CURLYM || op == CURLYN || op == CURLYX)
+ Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
+ Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
+ if (hi == REG_INFTY)
+ sv_catpvs(sv, "INFTY");
+ else
+ Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
+ sv_catpvs(sv, "}");
+ }
+ else if (k == WHILEM && o->flags) /* Ordinal/of */
+ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
+ else if (k == REF || k == OPEN || k == CLOSE
+ || k == GROUPP || op == ACCEPT)
+ {
+ AV *name_list= NULL;
+ U32 parno= (op == ACCEPT) ? (U32)ARG2L(o) :
+ (op == OPEN || op == CLOSE) ? (U32)PARNO(o) :
+ (U32)ARG(o);
+ Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
+ if ( RXp_PAREN_NAMES(prog) ) {
+ name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
+ } else if ( pRExC_state ) {
+ name_list= RExC_paren_name_list;
+ }
+ if ( name_list ) {
+ if ( k != REF || (op < REFN)) {
+ SV **name= av_fetch_simple(name_list, parno, 0 );
+ if (name)
+ Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
+ }
+ else
+ if (parno > 0) {
+ /* parno must always be larger than 0 for this block
+ * as it represents a slot into the data array, which
+ * has the 0 slot reserved for a placeholder so any valid
+ * index into it is always true, eg non-zero
+ * see the '%' "what" type and the implementation of
+ * S_reg_add_data()
+ */
+ SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
+ I32 *nums=(I32*)SvPVX(sv_dat);
+ SV **name= av_fetch_simple(name_list, nums[0], 0 );
+ I32 n;
+ if (name) {
+ for ( n=0; n<SvIVX(sv_dat); n++ ) {
+ Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
+ (n ? "," : ""), (IV)nums[n]);
+ }
+ Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
+ }
+ }
+ }
+ if ( k == REF && reginfo) {
+ U32 n = ARG(o); /* which paren pair */
+ I32 ln = prog->offs[n].start;
+ if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
+ Perl_sv_catpvf(aTHX_ sv, ": FAIL");
+ else if (ln == prog->offs[n].end)
+ Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
+ else {
+ const char *s = reginfo->strbeg + ln;
+ Perl_sv_catpvf(aTHX_ sv, ": ");
+ Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
+ PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
+ }
+ }
+ } else if (k == GOSUB) {
+ AV *name_list= NULL;
+ if ( RXp_PAREN_NAMES(prog) ) {
+ name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
+ } else if ( pRExC_state ) {
+ name_list= RExC_paren_name_list;
+ }
+
+ /* Paren and offset */
+ Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
+ (int)((o + (int)ARG2L(o)) - progi->program) );
+ if (name_list) {
+ SV **name= av_fetch_simple(name_list, ARG(o), 0 );
+ if (name)
+ Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
+ }
+ }
+ else if (k == LOGICAL)
+ /* 2: embedded, otherwise 1 */
+ Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
+ else if (k == ANYOF || k == ANYOFH || k == ANYOFR) {
+ U8 flags;
+ char * bitmap;
+ U8 do_sep = 0; /* Do we need to separate various components of the
+ output? */
+ /* Set if there is still an unresolved user-defined property */
+ SV *unresolved = NULL;
+
+ /* Things that are ignored except when the runtime locale is UTF-8 */
+ SV *only_utf8_locale_invlist = NULL;
+
+ /* Code points that don't fit in the bitmap */
+ SV *nonbitmap_invlist = NULL;
+
+ /* And things that aren't in the bitmap, but are small enough to be */
+ SV* bitmap_range_not_in_bitmap = NULL;
+
+ bool inverted;
+
+ if (k != ANYOF) {
+ flags = 0;
+ bitmap = NULL;
+ }
+ else {
+ flags = ANYOF_FLAGS(o);
+ bitmap = ANYOF_BITMAP(o);
+ }
+
+ if (op == ANYOFL || op == ANYOFPOSIXL) {
+ if ((flags & ANYOFL_UTF8_LOCALE_REQD)) {
+ sv_catpvs(sv, "{utf8-locale-reqd}");
+ }
+ if (flags & ANYOFL_FOLD) {
+ sv_catpvs(sv, "{i}");
+ }
+ }
+
+ inverted = flags & ANYOF_INVERT;
+
+ /* If there is stuff outside the bitmap, get it */
+ if (k == ANYOFR) {
+
+ /* For a single range, split into the parts inside vs outside the
+ * bitmap. */
+ UV start = ANYOFRbase(o);
+ UV end = ANYOFRbase(o) + ANYOFRdelta(o);
+
+ if (start < NUM_ANYOF_CODE_POINTS) {
+ if (end < NUM_ANYOF_CODE_POINTS) {
+ bitmap_range_not_in_bitmap
+ = _add_range_to_invlist(bitmap_range_not_in_bitmap,
+ start, end);
+ }
+ else {
+ bitmap_range_not_in_bitmap
+ = _add_range_to_invlist(bitmap_range_not_in_bitmap,
+ start, NUM_ANYOF_CODE_POINTS);
+ start = NUM_ANYOF_CODE_POINTS;
+ }
+ }
+
+ if (start >= NUM_ANYOF_CODE_POINTS) {
+ nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
+ ANYOFRbase(o),
+ ANYOFRbase(o) + ANYOFRdelta(o));
+ }
+ }
+ else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) {
+ nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
+ NUM_ANYOF_CODE_POINTS,
+ UV_MAX);
+ }
+ else if (ANYOF_HAS_AUX(o)) {
+ (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE,
+ &unresolved,
+ &only_utf8_locale_invlist,
+ &nonbitmap_invlist);
+
+ /* The aux data may contain stuff that could fit in the bitmap.
+ * This could come from a user-defined property being finally
+ * resolved when this call was done; or much more likely because
+ * there are matches that require UTF-8 to be valid, and so aren't
+ * in the bitmap (or ANYOFR). This is teased apart later */
+ _invlist_intersection(nonbitmap_invlist,
+ PL_InBitmap,
+ &bitmap_range_not_in_bitmap);
+ /* Leave just the things that don't fit into the bitmap */
+ _invlist_subtract(nonbitmap_invlist,
+ PL_InBitmap,
+ &nonbitmap_invlist);
+ }
+
+ /* Ready to start outputting. First, the initial left bracket */
+ Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+
+ if ( bitmap
+ || bitmap_range_not_in_bitmap
+ || only_utf8_locale_invlist
+ || unresolved)
+ {
+ /* Then all the things that could fit in the bitmap */
+ do_sep = put_charclass_bitmap_innards(
+ sv,
+ bitmap,
+ bitmap_range_not_in_bitmap,
+ only_utf8_locale_invlist,
+ o,
+ flags,
+
+ /* Can't try inverting for a
+ * better display if there
+ * are things that haven't
+ * been resolved */
+ (unresolved != NULL || k == ANYOFR));
+ SvREFCNT_dec(bitmap_range_not_in_bitmap);
+
+ /* If there are user-defined properties which haven't been defined
+ * yet, output them. If the result is not to be inverted, it is
+ * clearest to output them in a separate [] from the bitmap range
+ * stuff. If the result is to be complemented, we have to show
+ * everything in one [], as the inversion applies to the whole
+ * thing. Use {braces} to separate them from anything in the
+ * bitmap and anything above the bitmap. */
+ if (unresolved) {
+ if (inverted) {
+ if (! do_sep) { /* If didn't output anything in the bitmap
+ */
+ sv_catpvs(sv, "^");
+ }
+ sv_catpvs(sv, "{");
+ }
+ else if (do_sep) {
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
+ PL_colors[0]);
+ }
+ sv_catsv(sv, unresolved);
+ if (inverted) {
+ sv_catpvs(sv, "}");
+ }
+ do_sep = ! inverted;
+ }
+ else if ( do_sep == 2
+ && ! nonbitmap_invlist
+ && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o))
+ {
+ /* Here, the display shows the class as inverted, and
+ * everything above the lower display should also match, but
+ * there is no indication of that. Add this range so the code
+ * below will add it to the display */
+ _invlist_union_complement_2nd(nonbitmap_invlist,
+ PL_InBitmap,
+ &nonbitmap_invlist);
+ }
+ }
+
+ /* And, finally, add the above-the-bitmap stuff */
+ if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
+ SV* contents;
+
+ /* See if truncation size is overridden */
+ const STRLEN dump_len = (PL_dump_re_max_len > 256)
+ ? PL_dump_re_max_len
+ : 256;
+
+ /* This is output in a separate [] */
+ if (do_sep) {
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
+ }
+
+ /* And, for easy of understanding, it is shown in the
+ * uncomplemented form if possible. The one exception being if
+ * there are unresolved items, where the inversion has to be
+ * delayed until runtime */
+ if (inverted && ! unresolved) {
+ _invlist_invert(nonbitmap_invlist);
+ _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
+ }
+
+ contents = invlist_contents(nonbitmap_invlist,
+ FALSE /* output suitable for catsv */
+ );
+
+ /* If the output is shorter than the permissible maximum, just do it. */
+ if (SvCUR(contents) <= dump_len) {
+ sv_catsv(sv, contents);
+ }
+ else {
+ const char * contents_string = SvPVX(contents);
+ STRLEN i = dump_len;
+
+ /* Otherwise, start at the permissible max and work back to the
+ * first break possibility */
+ while (i > 0 && contents_string[i] != ' ') {
+ i--;
+ }
+ if (i == 0) { /* Fail-safe. Use the max if we couldn't
+ find a legal break */
+ i = dump_len;
+ }
+
+ sv_catpvn(sv, contents_string, i);
+ sv_catpvs(sv, "...");
+ }
+
+ SvREFCNT_dec_NN(contents);
+ SvREFCNT_dec_NN(nonbitmap_invlist);
+ }
+
+ /* And finally the matching, closing ']' */
+ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+
+ if (op == ANYOFHs) {
+ Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
+ }
+ else if (REGNODE_TYPE(op) != ANYOF) {
+ U8 lowest = (op != ANYOFHr)
+ ? FLAGS(o)
+ : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
+ U8 highest = (op == ANYOFHr)
+ ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
+ : (op == ANYOFH || op == ANYOFR)
+ ? 0xFF
+ : lowest;
+#ifndef EBCDIC
+ if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
+#endif
+ {
+ Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
+ if (lowest != highest) {
+ Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+ }
+ Perl_sv_catpvf(aTHX_ sv, ")");
+ }
+ }
+
+ SvREFCNT_dec(unresolved);
+ }
+ else if (k == ANYOFM) {
+ SV * cp_list = get_ANYOFM_contents(o);
+
+ Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+ if (op == NANYOFM) {
+ _invlist_invert(cp_list);
+ }
+
+ put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
+ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+
+ SvREFCNT_dec(cp_list);
+ }
+ else if (k == ANYOFHbbm) {
+ SV * cp_list = get_ANYOFHbbm_contents(o);
+ Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+
+ sv_catsv(sv, invlist_contents(cp_list,
+ FALSE /* output suitable for catsv */
+ ));
+ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+
+ SvREFCNT_dec(cp_list);
+ }
+ else if (k == POSIXD || k == NPOSIXD) {
+ U8 index = FLAGS(o) * 2;
+ if (index < C_ARRAY_LENGTH(anyofs)) {
+ if (*anyofs[index] != '[') {
+ sv_catpvs(sv, "[");
+ }
+ sv_catpv(sv, anyofs[index]);
+ if (*anyofs[index] != '[') {
+ sv_catpvs(sv, "]");
+ }
+ }
+ else {
+ Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
+ }
+ }
+ else if (k == BOUND || k == NBOUND) {
+ /* Must be synced with order of 'bound_type' in regcomp.h */
+ const char * const bounds[] = {
+ "", /* Traditional */
+ "{gcb}",
+ "{lb}",
+ "{sb}",
+ "{wb}"
+ };
+ assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
+ sv_catpv(sv, bounds[FLAGS(o)]);
+ }
+ else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) {
+ Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
+ if (o->next_off) {
+ Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
+ }
+ Perl_sv_catpvf(aTHX_ sv, "]");
+ }
+ else if (op == SBOL)
+ Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
+
+ /* add on the verb argument if there is one */
+ if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && o->flags) {
+ if ( ARG(o) )
+ Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
+ SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
+ else
+ sv_catpvs(sv, ":NULL");
+ }
+#else
+ PERL_UNUSED_CONTEXT;
+ PERL_UNUSED_ARG(sv);
+ PERL_UNUSED_ARG(o);
+ PERL_UNUSED_ARG(prog);
+ PERL_UNUSED_ARG(reginfo);
+ PERL_UNUSED_ARG(pRExC_state);
+#endif /* DEBUGGING */
+}
+
+#ifdef DEBUGGING
+
+STATIC void
+S_put_code_point(pTHX_ SV *sv, UV c)
+{
+ PERL_ARGS_ASSERT_PUT_CODE_POINT;
+
+ if (c > 255) {
+ Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
+ }
+ else if (isPRINT(c)) {
+ const char string = (char) c;
+
+ /* We use {phrase} as metanotation in the class, so also escape literal
+ * braces */
+ if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
+ sv_catpvs(sv, "\\");
+ sv_catpvn(sv, &string, 1);
+ }
+ else if (isMNEMONIC_CNTRL(c)) {
+ Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
+ }
+ else {
+ Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
+ }
+}
+
+STATIC void
+S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
+{
+ /* Appends to 'sv' a displayable version of the range of code points from
+ * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
+ * that have them, when they occur at the beginning or end of the range.
+ * It uses hex to output the remaining code points, unless 'allow_literals'
+ * is true, in which case the printable ASCII ones are output as-is (though
+ * some of these will be escaped by put_code_point()).
+ *
+ * NOTE: This is designed only for printing ranges of code points that fit
+ * inside an ANYOF bitmap. Higher code points are simply suppressed
+ */
+
+ const unsigned int min_range_count = 3;
+
+ assert(start <= end);
+
+ PERL_ARGS_ASSERT_PUT_RANGE;
+
+ while (start <= end) {
+ UV this_end;
+ const char * format;
+
+ if ( end - start < min_range_count
+ && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
+ {
+ /* Output a range of 1 or 2 chars individually, or longer ranges
+ * when printable */
+ for (; start <= end; start++) {
+ put_code_point(sv, start);
+ }
+ break;
+ }
+
+ /* If permitted by the input options, and there is a possibility that
+ * this range contains a printable literal, look to see if there is
+ * one. */
+ if (allow_literals && start <= MAX_PRINT_A) {
+
+ /* If the character at the beginning of the range isn't an ASCII
+ * printable, effectively split the range into two parts:
+ * 1) the portion before the first such printable,
+ * 2) the rest
+ * and output them separately. */
+ if (! isPRINT_A(start)) {
+ UV temp_end = start + 1;
+
+ /* There is no point looking beyond the final possible
+ * printable, in MAX_PRINT_A */
+ UV max = MIN(end, MAX_PRINT_A);
+
+ while (temp_end <= max && ! isPRINT_A(temp_end)) {
+ temp_end++;
+ }
+
+ /* Here, temp_end points to one beyond the first printable if
+ * found, or to one beyond 'max' if not. If none found, make
+ * sure that we use the entire range */
+ if (temp_end > MAX_PRINT_A) {
+ temp_end = end + 1;
+ }
+
+ /* Output the first part of the split range: the part that
+ * doesn't have printables, with the parameter set to not look
+ * for literals (otherwise we would infinitely recurse) */
+ put_range(sv, start, temp_end - 1, FALSE);
+
+ /* The 2nd part of the range (if any) starts here. */
+ start = temp_end;
+
+ /* We do a continue, instead of dropping down, because even if
+ * the 2nd part is non-empty, it could be so short that we want
+ * to output it as individual characters, as tested for at the
+ * top of this loop. */
+ continue;
+ }
+
+ /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
+ * output a sub-range of just the digits or letters, then process
+ * the remaining portion as usual. */
+ if (isALPHANUMERIC_A(start)) {
+ UV mask = (isDIGIT_A(start))
+ ? CC_DIGIT_
+ : isUPPER_A(start)
+ ? CC_UPPER_
+ : CC_LOWER_;
+ UV temp_end = start + 1;
+
+ /* Find the end of the sub-range that includes just the
+ * characters in the same class as the first character in it */
+ while (temp_end <= end && generic_isCC_A_(temp_end, mask)) {
+ temp_end++;
+ }
+ temp_end--;
+
+ /* For short ranges, don't duplicate the code above to output
+ * them; just call recursively */
+ if (temp_end - start < min_range_count) {
+ put_range(sv, start, temp_end, FALSE);
+ }
+ else { /* Output as a range */
+ put_code_point(sv, start);
+ sv_catpvs(sv, "-");
+ put_code_point(sv, temp_end);
+ }
+ start = temp_end + 1;
+ continue;
+ }
+
+ /* We output any other printables as individual characters */
+ if (isPUNCT_A(start) || isSPACE_A(start)) {
+ while (start <= end && (isPUNCT_A(start)
+ || isSPACE_A(start)))
+ {
+ put_code_point(sv, start);
+ start++;
+ }
+ continue;
+ }
+ } /* End of looking for literals */
+
+ /* Here is not to output as a literal. Some control characters have
+ * mnemonic names. Split off any of those at the beginning and end of
+ * the range to print mnemonically. It isn't possible for many of
+ * these to be in a row, so this won't overwhelm with output */
+ if ( start <= end
+ && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
+ {
+ while (isMNEMONIC_CNTRL(start) && start <= end) {
+ put_code_point(sv, start);
+ start++;
+ }
+
+ /* If this didn't take care of the whole range ... */
+ if (start <= end) {
+
+ /* Look backwards from the end to find the final non-mnemonic
+ * */
+ UV temp_end = end;
+ while (isMNEMONIC_CNTRL(temp_end)) {
+ temp_end--;
+ }
+
+ /* And separately output the interior range that doesn't start
+ * or end with mnemonics */
+ put_range(sv, start, temp_end, FALSE);
+
+ /* Then output the mnemonic trailing controls */
+ start = temp_end + 1;
+ while (start <= end) {
+ put_code_point(sv, start);
+ start++;
+ }
+ break;
+ }
+ }
+
+ /* As a final resort, output the range or subrange as hex. */
+
+ if (start >= NUM_ANYOF_CODE_POINTS) {
+ this_end = end;
+ }
+ else { /* Have to split range at the bitmap boundary */
+ this_end = (end < NUM_ANYOF_CODE_POINTS)
+ ? end
+ : NUM_ANYOF_CODE_POINTS - 1;
+ }
+#if NUM_ANYOF_CODE_POINTS > 256
+ format = (this_end < 256)
+ ? "\\x%02" UVXf "-\\x%02" UVXf
+ : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
+#else
+ format = "\\x%02" UVXf "-\\x%02" UVXf;
+#endif
+ GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
+ Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
+ GCC_DIAG_RESTORE_STMT;
+ break;
+ }
+}
+
+STATIC void
+S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
+{
+ /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
+ * 'invlist' */
+
+ UV start, end;
+ bool allow_literals = TRUE;
+
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
+
+ /* Generally, it is more readable if printable characters are output as
+ * literals, but if a range (nearly) spans all of them, it's best to output
+ * it as a single range. This code will use a single range if all but 2
+ * ASCII printables are in it */
+ invlist_iterinit(invlist);
+ while (invlist_iternext(invlist, &start, &end)) {
+
+ /* If the range starts beyond the final printable, it doesn't have any
+ * in it */
+ if (start > MAX_PRINT_A) {
+ break;
+ }
+
+ /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
+ * all but two, the range must start and end no later than 2 from
+ * either end */
+ if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
+ if (end > MAX_PRINT_A) {
+ end = MAX_PRINT_A;
+ }
+ if (start < ' ') {
+ start = ' ';
+ }
+ if (end - start >= MAX_PRINT_A - ' ' - 2) {
+ allow_literals = FALSE;
+ }
+ break;
+ }
+ }
+ invlist_iterfinish(invlist);
+
+ /* Here we have figured things out. Output each range */
+ invlist_iterinit(invlist);
+ while (invlist_iternext(invlist, &start, &end)) {
+ if (start >= NUM_ANYOF_CODE_POINTS) {
+ break;
+ }
+ put_range(sv, start, end, allow_literals);
+ }
+ invlist_iterfinish(invlist);
+
+ return;
+}
+
+STATIC SV*
+S_put_charclass_bitmap_innards_common(pTHX_
+ SV* invlist, /* The bitmap */
+ SV* posixes, /* Under /l, things like [:word:], \S */
+ SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
+ SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
+ SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
+ const bool invert /* Is the result to be inverted? */
+)
+{
+ /* Create and return an SV containing a displayable version of the bitmap
+ * and associated information determined by the input parameters. If the
+ * output would have been only the inversion indicator '^', NULL is instead
+ * returned. */
+
+ SV * output;
+
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
+
+ if (invert) {
+ output = newSVpvs("^");
+ }
+ else {
+ output = newSVpvs("");
+ }
+
+ /* First, the code points in the bitmap that are unconditionally there */
+ put_charclass_bitmap_innards_invlist(output, invlist);
+
+ /* Traditionally, these have been placed after the main code points */
+ if (posixes) {
+ sv_catsv(output, posixes);
+ }
+
+ if (only_utf8 && _invlist_len(only_utf8)) {
+ Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
+ put_charclass_bitmap_innards_invlist(output, only_utf8);
+ }
+
+ if (not_utf8 && _invlist_len(not_utf8)) {
+ Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
+ put_charclass_bitmap_innards_invlist(output, not_utf8);
+ }
+
+ if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
+ Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
+ put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
+
+ /* This is the only list in this routine that can legally contain code
+ * points outside the bitmap range. The call just above to
+ * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
+ * output them here. There's about a half-dozen possible, and none in
+ * contiguous ranges longer than 2 */
+ if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
+ UV start, end;
+ SV* above_bitmap = NULL;
+
+ _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
+
+ invlist_iterinit(above_bitmap);
+ while (invlist_iternext(above_bitmap, &start, &end)) {
+ UV i;
+
+ for (i = start; i <= end; i++) {
+ put_code_point(output, i);
+ }
+ }
+ invlist_iterfinish(above_bitmap);
+ SvREFCNT_dec_NN(above_bitmap);
+ }
+ }
+
+ if (invert && SvCUR(output) == 1) {
+ return NULL;
+ }
+
+ return output;
+}
+
+STATIC U8
+S_put_charclass_bitmap_innards(pTHX_ SV *sv,
+ char *bitmap,
+ SV *nonbitmap_invlist,
+ SV *only_utf8_locale_invlist,
+ const regnode * const node,
+ const U8 flags,
+ const bool force_as_is_display)
+{
+ /* Appends to 'sv' a displayable version of the innards of the bracketed
+ * character class defined by the other arguments:
+ * 'bitmap' points to the bitmap, or NULL if to ignore that.
+ * 'nonbitmap_invlist' is an inversion list of the code points that are in
+ * the bitmap range, but for some reason aren't in the bitmap; NULL if
+ * none. The reasons for this could be that they require some
+ * condition such as the target string being or not being in UTF-8
+ * (under /d), or because they came from a user-defined property that
+ * was not resolved at the time of the regex compilation (under /u)
+ * 'only_utf8_locale_invlist' is an inversion list of the code points that
+ * are valid only if the runtime locale is a UTF-8 one; NULL if none
+ * 'node' is the regex pattern ANYOF node. It is needed only when the
+ * above two parameters are not null, and is passed so that this
+ * routine can tease apart the various reasons for them.
+ * 'flags' is the flags field of 'node'
+ * 'force_as_is_display' is TRUE if this routine should definitely NOT try
+ * to invert things to see if that leads to a cleaner display. If
+ * FALSE, this routine is free to use its judgment about doing this.
+ *
+ * It returns 0 if nothing was actually output. (It may be that
+ * the bitmap, etc is empty.)
+ * 1 if the output wasn't inverted (didn't begin with a '^')
+ * 2 if the output was inverted (did begin with a '^')
+ *
+ * When called for outputting the bitmap of a non-ANYOF node, just pass the
+ * bitmap, with the succeeding parameters set to NULL, and the final one to
+ * FALSE.
+ */
+
+ /* In general, it tries to display the 'cleanest' representation of the
+ * innards, choosing whether to display them inverted or not, regardless of
+ * whether the class itself is to be inverted. However, there are some
+ * cases where it can't try inverting, as what actually matches isn't known
+ * until runtime, and hence the inversion isn't either. */
+
+ bool inverting_allowed = ! force_as_is_display;
+
+ int i;
+ STRLEN orig_sv_cur = SvCUR(sv);
+
+ SV* invlist; /* Inversion list we accumulate of code points that
+ are unconditionally matched */
+ SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
+ UTF-8 */
+ SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
+ */
+ SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
+ SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
+ is UTF-8 */
+
+ SV* as_is_display; /* The output string when we take the inputs
+ literally */
+ SV* inverted_display; /* The output string when we invert the inputs */
+
+ bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
+ to match? */
+ /* We are biased in favor of displaying things without them being inverted,
+ * as that is generally easier to understand */
+ const int bias = 5;
+
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
+
+ /* Start off with whatever code points are passed in. (We clone, so we
+ * don't change the caller's list) */
+ if (nonbitmap_invlist) {
+ assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
+ invlist = invlist_clone(nonbitmap_invlist, NULL);
+ }
+ else { /* Worst case size is every other code point is matched */
+ invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
+ }
+
+ if (flags) {
+ if (OP(node) == ANYOFD) {
+
+ /* This flag indicates that the code points below 0x100 in the
+ * nonbitmap list are precisely the ones that match only when the
+ * target is UTF-8 (they should all be non-ASCII). */
+ if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) {
+ _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
+ _invlist_subtract(invlist, only_utf8, &invlist);
+ }
+
+ /* And this flag for matching all non-ASCII 0xFF and below */
+ if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) {
+ not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
+ }
+ }
+ else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
+
+ /* If either of these flags are set, what matches isn't
+ * determinable except during execution, so don't know enough here
+ * to invert */
+ if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
+ inverting_allowed = FALSE;
+ }
+
+ /* What the posix classes match also varies at runtime, so these
+ * will be output symbolically. */
+ if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
+ int i;
+
+ posixes = newSVpvs("");
+ for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
+ if (ANYOF_POSIXL_TEST(node, i)) {
+ sv_catpv(posixes, anyofs[i]);
+ }
+ }
+ }
+ }
+ }
+
+ /* Accumulate the bit map into the unconditional match list */
+ if (bitmap) {
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+ if (BITMAP_TEST(bitmap, i)) {
+ int start = i++;
+ for (;
+ i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
+ i++)
+ { /* empty */ }
+ invlist = _add_range_to_invlist(invlist, start, i-1);
+ }
+ }
+ }
+
+ /* Make sure that the conditional match lists don't have anything in them
+ * that match unconditionally; otherwise the output is quite confusing.
+ * This could happen if the code that populates these misses some
+ * duplication. */
+ if (only_utf8) {
+ _invlist_subtract(only_utf8, invlist, &only_utf8);
+ }
+ if (not_utf8) {
+ _invlist_subtract(not_utf8, invlist, &not_utf8);
+ }
+
+ if (only_utf8_locale_invlist) {
+
+ /* Since this list is passed in, we have to make a copy before
+ * modifying it */
+ only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
+
+ _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
+
+ /* And, it can get really weird for us to try outputting an inverted
+ * form of this list when it has things above the bitmap, so don't even
+ * try */
+ if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
+ inverting_allowed = FALSE;
+ }
+ }
+
+ /* Calculate what the output would be if we take the input as-is */
+ as_is_display = put_charclass_bitmap_innards_common(invlist,
+ posixes,
+ only_utf8,
+ not_utf8,
+ only_utf8_locale,
+ invert);
+
+ /* If have to take the output as-is, just do that */
+ if (! inverting_allowed) {
+ if (as_is_display) {
+ sv_catsv(sv, as_is_display);
+ SvREFCNT_dec_NN(as_is_display);
+ }
+ }
+ else { /* But otherwise, create the output again on the inverted input, and
+ use whichever version is shorter */
+
+ int inverted_bias, as_is_bias;
+
+ /* We will apply our bias to whichever of the results doesn't have
+ * the '^' */
+ bool trial_invert;
+ if (invert) {
+ trial_invert = FALSE;
+ as_is_bias = bias;
+ inverted_bias = 0;
+ }
+ else {
+ trial_invert = TRUE;
+ as_is_bias = 0;
+ inverted_bias = bias;
+ }
+
+ /* Now invert each of the lists that contribute to the output,
+ * excluding from the result things outside the possible range */
+
+ /* For the unconditional inversion list, we have to add in all the
+ * conditional code points, so that when inverted, they will be gone
+ * from it */
+ _invlist_union(only_utf8, invlist, &invlist);
+ _invlist_union(not_utf8, invlist, &invlist);
+ _invlist_union(only_utf8_locale, invlist, &invlist);
+ _invlist_invert(invlist);
+ _invlist_intersection(invlist, PL_InBitmap, &invlist);
+
+ if (only_utf8) {
+ _invlist_invert(only_utf8);
+ _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
+ }
+ else if (not_utf8) {
+
+ /* If a code point matches iff the target string is not in UTF-8,
+ * then complementing the result has it not match iff not in UTF-8,
+ * which is the same thing as matching iff it is UTF-8. */
+ only_utf8 = not_utf8;
+ not_utf8 = NULL;
+ }
+
+ if (only_utf8_locale) {
+ _invlist_invert(only_utf8_locale);
+ _invlist_intersection(only_utf8_locale,
+ PL_InBitmap,
+ &only_utf8_locale);
+ }
+
+ inverted_display = put_charclass_bitmap_innards_common(
+ invlist,
+ posixes,
+ only_utf8,
+ not_utf8,
+ only_utf8_locale, trial_invert);
+
+ /* Use the shortest representation, taking into account our bias
+ * against showing it inverted */
+ if ( inverted_display
+ && ( ! as_is_display
+ || ( SvCUR(inverted_display) + inverted_bias
+ < SvCUR(as_is_display) + as_is_bias)))
+ {
+ sv_catsv(sv, inverted_display);
+ invert = ! invert;
+ }
+ else if (as_is_display) {
+ sv_catsv(sv, as_is_display);
+ }
+
+ SvREFCNT_dec(as_is_display);
+ SvREFCNT_dec(inverted_display);
+ }
+
+ SvREFCNT_dec_NN(invlist);
+ SvREFCNT_dec(only_utf8);
+ SvREFCNT_dec(not_utf8);
+ SvREFCNT_dec(posixes);
+ SvREFCNT_dec(only_utf8_locale);
+
+ U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur);
+ if (did_output_something) {
+ /* Distinguish between non and inverted cases */
+ did_output_something += invert;
+ }
+
+ return did_output_something;
+}
+
+
+const regnode *
+Perl_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
+ const regnode *last, const regnode *plast,
+ SV* sv, I32 indent, U32 depth)
+{
+ const regnode *next;
+ const regnode *optstart= NULL;
+
+ RXi_GET_DECL(r, ri);
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
+
+ PERL_ARGS_ASSERT_DUMPUNTIL;
+
+#ifdef DEBUG_DUMPUNTIL
+ Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
+ last ? last-start : 0, plast ? plast-start : 0);
+#endif
+
+ if (plast && plast < last)
+ last= plast;
+
+ while (node && (!last || node < last)) {
+ const U8 op = OP(node);
+
+ if (op == CLOSE || op == SRCLOSE || op == WHILEM)
+ indent--;
+ next = regnext((regnode *)node);
+ const regnode *after = regnode_after((regnode *)node,0);
+
+ /* Where, what. */
+ if (op == OPTIMIZED) {
+ if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
+ optstart = node;
+ else
+ goto after_print;
+ } else
+ CLEAR_OPTSTART;
+
+ regprop(r, sv, node, NULL, NULL);
+ Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
+ (int)(2*indent + 1), "", SvPVX_const(sv));
+
+ if (op != OPTIMIZED) {
+ if (next == NULL) /* Next ptr. */
+ Perl_re_printf( aTHX_ " (0)");
+ else if (REGNODE_TYPE(op) == BRANCH
+ && REGNODE_TYPE(OP(next)) != BRANCH )
+ Perl_re_printf( aTHX_ " (FAIL)");
+ else
+ Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
+ Perl_re_printf( aTHX_ "\n");
+ }
+
+ after_print:
+ if (REGNODE_TYPE(op) == BRANCHJ) {
+ assert(next);
+ const regnode *nnode = (OP(next) == LONGJMP
+ ? regnext((regnode *)next)
+ : next);
+ if (last && nnode > last)
+ nnode = last;
+ DUMPUNTIL(after, nnode);
+ }
+ else if (REGNODE_TYPE(op) == BRANCH) {
+ assert(next);
+ DUMPUNTIL(after, next);
+ }
+ else if ( REGNODE_TYPE(op) == TRIE ) {
+ const regnode *this_trie = node;
+ const U32 n = ARG(node);
+ const reg_ac_data * const ac = op>=AHOCORASICK ?
+ (reg_ac_data *)ri->data->data[n] :
+ NULL;
+ const reg_trie_data * const trie =
+ (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
+#ifdef DEBUGGING
+ AV *const trie_words
+ = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
+#endif
+ const regnode *nextbranch= NULL;
+ I32 word_idx;
+ SvPVCLEAR(sv);
+ for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
+ SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0);
+
+ Perl_re_indentf( aTHX_ "%s ",
+ indent+3,
+ elem_ptr
+ ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
+ SvCUR(*elem_ptr), PL_dump_re_max_len,
+ PL_colors[0], PL_colors[1],
+ (SvUTF8(*elem_ptr)
+ ? PERL_PV_ESCAPE_UNI
+ : 0)
+ | PERL_PV_PRETTY_ELLIPSES
+ | PERL_PV_PRETTY_LTGT
+ )
+ : "???"
+ );
+ if (trie->jump) {
+ U16 dist= trie->jump[word_idx+1];
+ Perl_re_printf( aTHX_ "(%" UVuf ")\n",
+ (UV)((dist ? this_trie + dist : next) - start));
+ if (dist) {
+ if (!nextbranch)
+ nextbranch= this_trie + trie->jump[0];
+ DUMPUNTIL(this_trie + dist, nextbranch);
+ }
+ if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
+ nextbranch= regnext((regnode *)nextbranch);
+ } else {
+ Perl_re_printf( aTHX_ "\n");
+ }
+ }
+ if (last && next > last)
+ node= last;
+ else
+ node= next;
+ }
+ else if ( op == CURLY ) { /* "next" might be very big: optimizer */
+ DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */
+ }
+ else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) {
+ assert(next);
+ DUMPUNTIL(after, next);
+ }
+ else if ( op == PLUS || op == STAR) {
+ DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */
+ }
+ else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) {
+ /* Literal string, where present. */
+ node = (const regnode *)REGNODE_AFTER_varies(node);
+ }
+ else {
+ node = REGNODE_AFTER_opcode(node,op);
+ }
+ if (op == CURLYX || op == OPEN || op == SROPEN)
+ indent++;
+ if (REGNODE_TYPE(op) == END)
+ break;
+ }
+ CLEAR_OPTSTART;
+#ifdef DEBUG_DUMPUNTIL
+ Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
+#endif
+ return node;
+}
+
+#endif /* DEBUGGING */