summaryrefslogtreecommitdiff
path: root/regcomp_debug.c
diff options
context:
space:
mode:
Diffstat (limited to 'regcomp_debug.c')
-rw-r--r--regcomp_debug.c1625
1 files changed, 1625 insertions, 0 deletions
diff --git a/regcomp_debug.c b/regcomp_debug.c
new file mode 100644
index 0000000000..c7c42f6941
--- /dev/null
+++ b/regcomp_debug.c
@@ -0,0 +1,1625 @@
+#ifdef PERL_EXT_RE_BUILD
+#include "re_top.h"
+#endif
+
+#include "EXTERN.h"
+#define PERL_IN_REGEX_ENGINE
+#define PERL_IN_REGCOMP_ANY
+#define PERL_IN_REGCOMP_DEBUG_C
+#include "perl.h"
+
+#ifdef PERL_IN_XSUB_RE
+# include "re_comp.h"
+#else
+# include "regcomp.h"
+#endif
+
+#include "invlist_inline.h"
+#include "unicode_constants.h"
+#include "regcomp_internal.h"
+
+#ifdef DEBUGGING
+
+int
+Perl_re_printf(pTHX_ const char *fmt, ...)
+{
+ va_list ap;
+ int result;
+ PerlIO *f= Perl_debug_log;
+ PERL_ARGS_ASSERT_RE_PRINTF;
+ va_start(ap, fmt);
+ result = PerlIO_vprintf(f, fmt, ap);
+ va_end(ap);
+ return result;
+}
+
+int
+Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
+{
+ va_list ap;
+ int result;
+ PerlIO *f= Perl_debug_log;
+ PERL_ARGS_ASSERT_RE_INDENTF;
+ va_start(ap, depth);
+ PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
+ result = PerlIO_vprintf(f, fmt, ap);
+ va_end(ap);
+ return result;
+}
+
+void
+Perl_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
+ const char *close_str)
+{
+ PERL_ARGS_ASSERT_DEBUG_SHOW_STUDY_FLAGS;
+ if (!flags)
+ return;
+
+ Perl_re_printf( aTHX_ "%s", open_str);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
+ Perl_re_printf( aTHX_ "%s", close_str);
+}
+
+void
+Perl_debug_studydata(pTHX_ const char *where, scan_data_t *data,
+ U32 depth, int is_inf,
+ SSize_t min, SSize_t stopmin, SSize_t delta)
+{
+ PERL_ARGS_ASSERT_DEBUG_STUDYDATA;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
+
+ DEBUG_OPTIMISE_MORE_r({
+ if (!data)
+ return;
+ Perl_re_indentf(aTHX_ "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
+ depth,
+ where,
+ min, stopmin, delta,
+ (IV)data->pos_min,
+ (IV)data->pos_delta,
+ (UV)data->flags
+ );
+
+ Perl_debug_show_study_flags(aTHX_ data->flags," [","]");
+
+ Perl_re_printf( aTHX_
+ " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
+ (IV)data->whilem_c,
+ (IV)(data->last_closep ? *((data)->last_closep) : -1),
+ is_inf ? "INF " : ""
+ );
+
+ if (data->last_found) {
+ int i;
+ Perl_re_printf(aTHX_
+ "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
+ SvPVX_const(data->last_found),
+ (IV)data->last_end,
+ (IV)data->last_start_min,
+ (IV)data->last_start_max
+ );
+
+ for (i = 0; i < 2; i++) {
+ Perl_re_printf(aTHX_
+ " %s%s: '%s' @ %" IVdf "/%" IVdf,
+ data->cur_is_floating == i ? "*" : "",
+ i ? "Float" : "Fixed",
+ SvPVX_const(data->substrs[i].str),
+ (IV)data->substrs[i].min_offset,
+ (IV)data->substrs[i].max_offset
+ );
+ Perl_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
+ }
+ }
+
+ Perl_re_printf( aTHX_ "\n");
+ });
+}
+
+
+void
+Perl_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
+ regnode *scan, U32 depth, U32 flags)
+{
+ PERL_ARGS_ASSERT_DEBUG_PEEP;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
+
+ DEBUG_OPTIMISE_r({
+ regnode *Next;
+
+ if (!scan)
+ return;
+ Next = regnext(scan);
+ regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
+ Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
+ depth,
+ str,
+ REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
+ Next ? (REG_NODE_NUM(Next)) : 0 );
+ Perl_debug_show_study_flags(aTHX_ flags," [ ","]");
+ Perl_re_printf( aTHX_ "\n");
+ });
+}
+
+#endif /* DEBUGGING */
+
+/*
+ - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
+ */
+#ifdef DEBUGGING
+
+static void
+S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
+{
+ int bit;
+ int set=0;
+
+ ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
+
+ for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
+ if (flags & (1<<bit)) {
+ if (!set++ && lead)
+ Perl_re_printf( aTHX_ "%s", lead);
+ Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
+ }
+ }
+ if (lead) {
+ if (set)
+ Perl_re_printf( aTHX_ "\n");
+ else
+ Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
+ }
+}
+
+static void
+S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
+{
+ int bit;
+ int set=0;
+ regex_charset cs;
+
+ ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
+
+ for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
+ if (flags & (1U<<bit)) {
+ if ((1U<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
+ continue;
+ }
+ if (!set++ && lead)
+ Perl_re_printf( aTHX_ "%s", lead);
+ Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
+ }
+ }
+ if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
+ if (!set++ && lead) {
+ Perl_re_printf( aTHX_ "%s", lead);
+ }
+ switch (cs) {
+ case REGEX_UNICODE_CHARSET:
+ Perl_re_printf( aTHX_ "UNICODE");
+ break;
+ case REGEX_LOCALE_CHARSET:
+ Perl_re_printf( aTHX_ "LOCALE");
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
+ break;
+ case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
+ Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
+ break;
+ default:
+ Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
+ break;
+ }
+ }
+ if (lead) {
+ if (set)
+ Perl_re_printf( aTHX_ "\n");
+ else
+ Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
+ }
+}
+#endif
+
+void
+Perl_regdump(pTHX_ const regexp *r)
+{
+#ifdef DEBUGGING
+ int i;
+ SV * const sv = sv_newmortal();
+ SV *dsv= sv_newmortal();
+ RXi_GET_DECL(r, ri);
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
+
+ PERL_ARGS_ASSERT_REGDUMP;
+
+ (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
+
+ /* Header fields of interest. */
+ for (i = 0; i < 2; i++) {
+ if (r->substrs->data[i].substr) {
+ RE_PV_QUOTED_DECL(s, 0, dsv,
+ SvPVX_const(r->substrs->data[i].substr),
+ RE_SV_DUMPLEN(r->substrs->data[i].substr),
+ PL_dump_re_max_len);
+ Perl_re_printf( aTHX_
+ "%s %s%s at %" IVdf "..%" UVuf " ",
+ i ? "floating" : "anchored",
+ s,
+ RE_SV_TAIL(r->substrs->data[i].substr),
+ (IV)r->substrs->data[i].min_offset,
+ (UV)r->substrs->data[i].max_offset);
+ }
+ else if (r->substrs->data[i].utf8_substr) {
+ RE_PV_QUOTED_DECL(s, 1, dsv,
+ SvPVX_const(r->substrs->data[i].utf8_substr),
+ RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
+ 30);
+ Perl_re_printf( aTHX_
+ "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
+ i ? "floating" : "anchored",
+ s,
+ RE_SV_TAIL(r->substrs->data[i].utf8_substr),
+ (IV)r->substrs->data[i].min_offset,
+ (UV)r->substrs->data[i].max_offset);
+ }
+ }
+
+ if (r->check_substr || r->check_utf8)
+ Perl_re_printf( aTHX_
+ (const char *)
+ ( r->check_substr == r->substrs->data[1].substr
+ && r->check_utf8 == r->substrs->data[1].utf8_substr
+ ? "(checking floating" : "(checking anchored"));
+ if (r->intflags & PREGf_NOSCAN)
+ Perl_re_printf( aTHX_ " noscan");
+ if (r->extflags & RXf_CHECK_ALL)
+ Perl_re_printf( aTHX_ " isall");
+ if (r->check_substr || r->check_utf8)
+ Perl_re_printf( aTHX_ ") ");
+
+ if (ri->regstclass) {
+ regprop(r, sv, ri->regstclass, NULL, NULL);
+ Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
+ }
+ if (r->intflags & PREGf_ANCH) {
+ Perl_re_printf( aTHX_ "anchored");
+ if (r->intflags & PREGf_ANCH_MBOL)
+ Perl_re_printf( aTHX_ "(MBOL)");
+ if (r->intflags & PREGf_ANCH_SBOL)
+ Perl_re_printf( aTHX_ "(SBOL)");
+ if (r->intflags & PREGf_ANCH_GPOS)
+ Perl_re_printf( aTHX_ "(GPOS)");
+ Perl_re_printf( aTHX_ " ");
+ }
+ if (r->intflags & PREGf_GPOS_SEEN)
+ Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
+ if (r->intflags & PREGf_SKIP)
+ Perl_re_printf( aTHX_ "plus ");
+ if (r->intflags & PREGf_IMPLICIT)
+ Perl_re_printf( aTHX_ "implicit ");
+ Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
+ if (r->extflags & RXf_EVAL_SEEN)
+ Perl_re_printf( aTHX_ "with eval ");
+ Perl_re_printf( aTHX_ "\n");
+ DEBUG_FLAGS_r({
+ regdump_extflags("r->extflags: ", r->extflags);
+ regdump_intflags("r->intflags: ", r->intflags);
+ });
+#else
+ PERL_ARGS_ASSERT_REGDUMP;
+ PERL_UNUSED_CONTEXT;
+ PERL_UNUSED_ARG(r);
+#endif /* DEBUGGING */
+}
+
+/* Should be synchronized with ANYOF_ #defines in regcomp.h */
+#ifdef DEBUGGING
+
+# if CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1 || CC_ALPHA_ != 2 \
+ || CC_LOWER_ != 3 || CC_UPPER_ != 4 || CC_PUNCT_ != 5 \
+ || CC_PRINT_ != 6 || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8 \
+ || CC_CASED_ != 9 || CC_SPACE_ != 10 || CC_BLANK_ != 11 \
+ || CC_XDIGIT_ != 12 || CC_CNTRL_ != 13 || CC_ASCII_ != 14 \
+ || CC_VERTSPACE_ != 15
+# error Need to adjust order of anyofs[]
+# endif
+static const char * const anyofs[] = {
+ "\\w",
+ "\\W",
+ "\\d",
+ "\\D",
+ "[:alpha:]",
+ "[:^alpha:]",
+ "[:lower:]",
+ "[:^lower:]",
+ "[:upper:]",
+ "[:^upper:]",
+ "[:punct:]",
+ "[:^punct:]",
+ "[:print:]",
+ "[:^print:]",
+ "[:alnum:]",
+ "[:^alnum:]",
+ "[:graph:]",
+ "[:^graph:]",
+ "[:cased:]",
+ "[:^cased:]",
+ "\\s",
+ "\\S",
+ "[:blank:]",
+ "[:^blank:]",
+ "[:xdigit:]",
+ "[:^xdigit:]",
+ "[:cntrl:]",
+ "[:^cntrl:]",
+ "[:ascii:]",
+ "[:^ascii:]",
+ "\\v",
+ "\\V"
+};
+#endif
+
+/*
+- regprop - printable representation of opcode, with run time support
+*/
+
+void
+Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
+{
+#ifdef DEBUGGING
+ U8 k;
+ const U8 op = OP(o);
+ RXi_GET_DECL(prog, progi);
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
+
+ PERL_ARGS_ASSERT_REGPROP;
+
+ SvPVCLEAR(sv);
+
+ if (op > REGNODE_MAX) { /* regnode.type is unsigned */
+ if (pRExC_state) { /* This gives more info, if we have it */
+ FAIL3("panic: corrupted regexp opcode %d > %d",
+ (int)op, (int)REGNODE_MAX);
+ }
+ else {
+ Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
+ (int)op, (int)REGNODE_MAX);
+ }
+ }
+ sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */
+
+ k = REGNODE_TYPE(op);
+
+ if (k == EXACT) {
+ sv_catpvs(sv, " ");
+ /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
+ * is a crude hack but it may be the best for now since
+ * we have no flag "this EXACTish node was UTF-8"
+ * --jhi */
+ pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
+ PL_colors[0], PL_colors[1],
+ PERL_PV_ESCAPE_UNI_DETECT |
+ PERL_PV_ESCAPE_NONASCII |
+ PERL_PV_PRETTY_ELLIPSES |
+ PERL_PV_PRETTY_LTGT |
+ PERL_PV_PRETTY_NOCLEAR
+ );
+ } else if (k == TRIE) {
+ /* print the details of the trie in dumpuntil instead, as
+ * progi->data isn't available here */
+ const U32 n = ARG(o);
+ const reg_ac_data * const ac = IS_TRIE_AC(op) ?
+ (reg_ac_data *)progi->data->data[n] :
+ NULL;
+ const reg_trie_data * const trie
+ = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
+
+ Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(o->flags));
+ DEBUG_TRIE_COMPILE_r({
+ if (trie->jump)
+ sv_catpvs(sv, "(JUMP)");
+ Perl_sv_catpvf(aTHX_ sv,
+ "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
+ (UV)trie->startstate,
+ (IV)trie->statecount-1, /* -1 because of the unused 0 element */
+ (UV)trie->wordcount,
+ (UV)trie->minlen,
+ (UV)trie->maxlen,
+ (UV)TRIE_CHARCOUNT(trie),
+ (UV)trie->uniquecharcount
+ );
+ });
+ if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
+ sv_catpvs(sv, "[");
+ (void) put_charclass_bitmap_innards(sv,
+ ((IS_ANYOF_TRIE(op))
+ ? ANYOF_BITMAP(o)
+ : TRIE_BITMAP(trie)),
+ NULL,
+ NULL,
+ NULL,
+ 0,
+ FALSE
+ );
+ sv_catpvs(sv, "]");
+ }
+ } else if (k == CURLY) {
+ U32 lo = ARG1(o), hi = ARG2(o);
+ if (op == CURLYM || op == CURLYN || op == CURLYX)
+ Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
+ Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
+ if (hi == REG_INFTY)
+ sv_catpvs(sv, "INFTY");
+ else
+ Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
+ sv_catpvs(sv, "}");
+ }
+ else if (k == WHILEM && o->flags) /* Ordinal/of */
+ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
+ else if (k == REF || k == OPEN || k == CLOSE
+ || k == GROUPP || op == ACCEPT)
+ {
+ AV *name_list= NULL;
+ U32 parno= (op == ACCEPT) ? (U32)ARG2L(o) :
+ (op == OPEN || op == CLOSE) ? (U32)PARNO(o) :
+ (U32)ARG(o);
+ Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
+ if ( RXp_PAREN_NAMES(prog) ) {
+ name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
+ } else if ( pRExC_state ) {
+ name_list= RExC_paren_name_list;
+ }
+ if ( name_list ) {
+ if ( k != REF || (op < REFN)) {
+ SV **name= av_fetch_simple(name_list, parno, 0 );
+ if (name)
+ Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
+ }
+ else
+ if (parno > 0) {
+ /* parno must always be larger than 0 for this block
+ * as it represents a slot into the data array, which
+ * has the 0 slot reserved for a placeholder so any valid
+ * index into it is always true, eg non-zero
+ * see the '%' "what" type and the implementation of
+ * S_reg_add_data()
+ */
+ SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
+ I32 *nums=(I32*)SvPVX(sv_dat);
+ SV **name= av_fetch_simple(name_list, nums[0], 0 );
+ I32 n;
+ if (name) {
+ for ( n=0; n<SvIVX(sv_dat); n++ ) {
+ Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
+ (n ? "," : ""), (IV)nums[n]);
+ }
+ Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
+ }
+ }
+ }
+ if ( k == REF && reginfo) {
+ U32 n = ARG(o); /* which paren pair */
+ I32 ln = prog->offs[n].start;
+ if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
+ Perl_sv_catpvf(aTHX_ sv, ": FAIL");
+ else if (ln == prog->offs[n].end)
+ Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
+ else {
+ const char *s = reginfo->strbeg + ln;
+ Perl_sv_catpvf(aTHX_ sv, ": ");
+ Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
+ PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
+ }
+ }
+ } else if (k == GOSUB) {
+ AV *name_list= NULL;
+ if ( RXp_PAREN_NAMES(prog) ) {
+ name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
+ } else if ( pRExC_state ) {
+ name_list= RExC_paren_name_list;
+ }
+
+ /* Paren and offset */
+ Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
+ (int)((o + (int)ARG2L(o)) - progi->program) );
+ if (name_list) {
+ SV **name= av_fetch_simple(name_list, ARG(o), 0 );
+ if (name)
+ Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
+ }
+ }
+ else if (k == LOGICAL)
+ /* 2: embedded, otherwise 1 */
+ Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
+ else if (k == ANYOF || k == ANYOFH || k == ANYOFR) {
+ U8 flags;
+ char * bitmap;
+ U8 do_sep = 0; /* Do we need to separate various components of the
+ output? */
+ /* Set if there is still an unresolved user-defined property */
+ SV *unresolved = NULL;
+
+ /* Things that are ignored except when the runtime locale is UTF-8 */
+ SV *only_utf8_locale_invlist = NULL;
+
+ /* Code points that don't fit in the bitmap */
+ SV *nonbitmap_invlist = NULL;
+
+ /* And things that aren't in the bitmap, but are small enough to be */
+ SV* bitmap_range_not_in_bitmap = NULL;
+
+ bool inverted;
+
+ if (k != ANYOF) {
+ flags = 0;
+ bitmap = NULL;
+ }
+ else {
+ flags = ANYOF_FLAGS(o);
+ bitmap = ANYOF_BITMAP(o);
+ }
+
+ if (op == ANYOFL || op == ANYOFPOSIXL) {
+ if ((flags & ANYOFL_UTF8_LOCALE_REQD)) {
+ sv_catpvs(sv, "{utf8-locale-reqd}");
+ }
+ if (flags & ANYOFL_FOLD) {
+ sv_catpvs(sv, "{i}");
+ }
+ }
+
+ inverted = flags & ANYOF_INVERT;
+
+ /* If there is stuff outside the bitmap, get it */
+ if (k == ANYOFR) {
+
+ /* For a single range, split into the parts inside vs outside the
+ * bitmap. */
+ UV start = ANYOFRbase(o);
+ UV end = ANYOFRbase(o) + ANYOFRdelta(o);
+
+ if (start < NUM_ANYOF_CODE_POINTS) {
+ if (end < NUM_ANYOF_CODE_POINTS) {
+ bitmap_range_not_in_bitmap
+ = _add_range_to_invlist(bitmap_range_not_in_bitmap,
+ start, end);
+ }
+ else {
+ bitmap_range_not_in_bitmap
+ = _add_range_to_invlist(bitmap_range_not_in_bitmap,
+ start, NUM_ANYOF_CODE_POINTS);
+ start = NUM_ANYOF_CODE_POINTS;
+ }
+ }
+
+ if (start >= NUM_ANYOF_CODE_POINTS) {
+ nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
+ ANYOFRbase(o),
+ ANYOFRbase(o) + ANYOFRdelta(o));
+ }
+ }
+ else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) {
+ nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
+ NUM_ANYOF_CODE_POINTS,
+ UV_MAX);
+ }
+ else if (ANYOF_HAS_AUX(o)) {
+ (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE,
+ &unresolved,
+ &only_utf8_locale_invlist,
+ &nonbitmap_invlist);
+
+ /* The aux data may contain stuff that could fit in the bitmap.
+ * This could come from a user-defined property being finally
+ * resolved when this call was done; or much more likely because
+ * there are matches that require UTF-8 to be valid, and so aren't
+ * in the bitmap (or ANYOFR). This is teased apart later */
+ _invlist_intersection(nonbitmap_invlist,
+ PL_InBitmap,
+ &bitmap_range_not_in_bitmap);
+ /* Leave just the things that don't fit into the bitmap */
+ _invlist_subtract(nonbitmap_invlist,
+ PL_InBitmap,
+ &nonbitmap_invlist);
+ }
+
+ /* Ready to start outputting. First, the initial left bracket */
+ Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+
+ if ( bitmap
+ || bitmap_range_not_in_bitmap
+ || only_utf8_locale_invlist
+ || unresolved)
+ {
+ /* Then all the things that could fit in the bitmap */
+ do_sep = put_charclass_bitmap_innards(
+ sv,
+ bitmap,
+ bitmap_range_not_in_bitmap,
+ only_utf8_locale_invlist,
+ o,
+ flags,
+
+ /* Can't try inverting for a
+ * better display if there
+ * are things that haven't
+ * been resolved */
+ (unresolved != NULL || k == ANYOFR));
+ SvREFCNT_dec(bitmap_range_not_in_bitmap);
+
+ /* If there are user-defined properties which haven't been defined
+ * yet, output them. If the result is not to be inverted, it is
+ * clearest to output them in a separate [] from the bitmap range
+ * stuff. If the result is to be complemented, we have to show
+ * everything in one [], as the inversion applies to the whole
+ * thing. Use {braces} to separate them from anything in the
+ * bitmap and anything above the bitmap. */
+ if (unresolved) {
+ if (inverted) {
+ if (! do_sep) { /* If didn't output anything in the bitmap
+ */
+ sv_catpvs(sv, "^");
+ }
+ sv_catpvs(sv, "{");
+ }
+ else if (do_sep) {
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
+ PL_colors[0]);
+ }
+ sv_catsv(sv, unresolved);
+ if (inverted) {
+ sv_catpvs(sv, "}");
+ }
+ do_sep = ! inverted;
+ }
+ else if ( do_sep == 2
+ && ! nonbitmap_invlist
+ && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o))
+ {
+ /* Here, the display shows the class as inverted, and
+ * everything above the lower display should also match, but
+ * there is no indication of that. Add this range so the code
+ * below will add it to the display */
+ _invlist_union_complement_2nd(nonbitmap_invlist,
+ PL_InBitmap,
+ &nonbitmap_invlist);
+ }
+ }
+
+ /* And, finally, add the above-the-bitmap stuff */
+ if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
+ SV* contents;
+
+ /* See if truncation size is overridden */
+ const STRLEN dump_len = (PL_dump_re_max_len > 256)
+ ? PL_dump_re_max_len
+ : 256;
+
+ /* This is output in a separate [] */
+ if (do_sep) {
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
+ }
+
+ /* And, for easy of understanding, it is shown in the
+ * uncomplemented form if possible. The one exception being if
+ * there are unresolved items, where the inversion has to be
+ * delayed until runtime */
+ if (inverted && ! unresolved) {
+ _invlist_invert(nonbitmap_invlist);
+ _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
+ }
+
+ contents = invlist_contents(nonbitmap_invlist,
+ FALSE /* output suitable for catsv */
+ );
+
+ /* If the output is shorter than the permissible maximum, just do it. */
+ if (SvCUR(contents) <= dump_len) {
+ sv_catsv(sv, contents);
+ }
+ else {
+ const char * contents_string = SvPVX(contents);
+ STRLEN i = dump_len;
+
+ /* Otherwise, start at the permissible max and work back to the
+ * first break possibility */
+ while (i > 0 && contents_string[i] != ' ') {
+ i--;
+ }
+ if (i == 0) { /* Fail-safe. Use the max if we couldn't
+ find a legal break */
+ i = dump_len;
+ }
+
+ sv_catpvn(sv, contents_string, i);
+ sv_catpvs(sv, "...");
+ }
+
+ SvREFCNT_dec_NN(contents);
+ SvREFCNT_dec_NN(nonbitmap_invlist);
+ }
+
+ /* And finally the matching, closing ']' */
+ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+
+ if (op == ANYOFHs) {
+ Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
+ }
+ else if (REGNODE_TYPE(op) != ANYOF) {
+ U8 lowest = (op != ANYOFHr)
+ ? FLAGS(o)
+ : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
+ U8 highest = (op == ANYOFHr)
+ ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
+ : (op == ANYOFH || op == ANYOFR)
+ ? 0xFF
+ : lowest;
+#ifndef EBCDIC
+ if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
+#endif
+ {
+ Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
+ if (lowest != highest) {
+ Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+ }
+ Perl_sv_catpvf(aTHX_ sv, ")");
+ }
+ }
+
+ SvREFCNT_dec(unresolved);
+ }
+ else if (k == ANYOFM) {
+ SV * cp_list = get_ANYOFM_contents(o);
+
+ Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+ if (op == NANYOFM) {
+ _invlist_invert(cp_list);
+ }
+
+ put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
+ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+
+ SvREFCNT_dec(cp_list);
+ }
+ else if (k == ANYOFHbbm) {
+ SV * cp_list = get_ANYOFHbbm_contents(o);
+ Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+
+ sv_catsv(sv, invlist_contents(cp_list,
+ FALSE /* output suitable for catsv */
+ ));
+ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+
+ SvREFCNT_dec(cp_list);
+ }
+ else if (k == POSIXD || k == NPOSIXD) {
+ U8 index = FLAGS(o) * 2;
+ if (index < C_ARRAY_LENGTH(anyofs)) {
+ if (*anyofs[index] != '[') {
+ sv_catpvs(sv, "[");
+ }
+ sv_catpv(sv, anyofs[index]);
+ if (*anyofs[index] != '[') {
+ sv_catpvs(sv, "]");
+ }
+ }
+ else {
+ Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
+ }
+ }
+ else if (k == BOUND || k == NBOUND) {
+ /* Must be synced with order of 'bound_type' in regcomp.h */
+ const char * const bounds[] = {
+ "", /* Traditional */
+ "{gcb}",
+ "{lb}",
+ "{sb}",
+ "{wb}"
+ };
+ assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
+ sv_catpv(sv, bounds[FLAGS(o)]);
+ }
+ else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) {
+ Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
+ if (o->next_off) {
+ Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
+ }
+ Perl_sv_catpvf(aTHX_ sv, "]");
+ }
+ else if (op == SBOL)
+ Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
+
+ /* add on the verb argument if there is one */
+ if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && o->flags) {
+ if ( ARG(o) )
+ Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
+ SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
+ else
+ sv_catpvs(sv, ":NULL");
+ }
+#else
+ PERL_UNUSED_CONTEXT;
+ PERL_UNUSED_ARG(sv);
+ PERL_UNUSED_ARG(o);
+ PERL_UNUSED_ARG(prog);
+ PERL_UNUSED_ARG(reginfo);
+ PERL_UNUSED_ARG(pRExC_state);
+#endif /* DEBUGGING */
+}
+
+#ifdef DEBUGGING
+
+STATIC void
+S_put_code_point(pTHX_ SV *sv, UV c)
+{
+ PERL_ARGS_ASSERT_PUT_CODE_POINT;
+
+ if (c > 255) {
+ Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
+ }
+ else if (isPRINT(c)) {
+ const char string = (char) c;
+
+ /* We use {phrase} as metanotation in the class, so also escape literal
+ * braces */
+ if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
+ sv_catpvs(sv, "\\");
+ sv_catpvn(sv, &string, 1);
+ }
+ else if (isMNEMONIC_CNTRL(c)) {
+ Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
+ }
+ else {
+ Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
+ }
+}
+
+STATIC void
+S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
+{
+ /* Appends to 'sv' a displayable version of the range of code points from
+ * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
+ * that have them, when they occur at the beginning or end of the range.
+ * It uses hex to output the remaining code points, unless 'allow_literals'
+ * is true, in which case the printable ASCII ones are output as-is (though
+ * some of these will be escaped by put_code_point()).
+ *
+ * NOTE: This is designed only for printing ranges of code points that fit
+ * inside an ANYOF bitmap. Higher code points are simply suppressed
+ */
+
+ const unsigned int min_range_count = 3;
+
+ assert(start <= end);
+
+ PERL_ARGS_ASSERT_PUT_RANGE;
+
+ while (start <= end) {
+ UV this_end;
+ const char * format;
+
+ if ( end - start < min_range_count
+ && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
+ {
+ /* Output a range of 1 or 2 chars individually, or longer ranges
+ * when printable */
+ for (; start <= end; start++) {
+ put_code_point(sv, start);
+ }
+ break;
+ }
+
+ /* If permitted by the input options, and there is a possibility that
+ * this range contains a printable literal, look to see if there is
+ * one. */
+ if (allow_literals && start <= MAX_PRINT_A) {
+
+ /* If the character at the beginning of the range isn't an ASCII
+ * printable, effectively split the range into two parts:
+ * 1) the portion before the first such printable,
+ * 2) the rest
+ * and output them separately. */
+ if (! isPRINT_A(start)) {
+ UV temp_end = start + 1;
+
+ /* There is no point looking beyond the final possible
+ * printable, in MAX_PRINT_A */
+ UV max = MIN(end, MAX_PRINT_A);
+
+ while (temp_end <= max && ! isPRINT_A(temp_end)) {
+ temp_end++;
+ }
+
+ /* Here, temp_end points to one beyond the first printable if
+ * found, or to one beyond 'max' if not. If none found, make
+ * sure that we use the entire range */
+ if (temp_end > MAX_PRINT_A) {
+ temp_end = end + 1;
+ }
+
+ /* Output the first part of the split range: the part that
+ * doesn't have printables, with the parameter set to not look
+ * for literals (otherwise we would infinitely recurse) */
+ put_range(sv, start, temp_end - 1, FALSE);
+
+ /* The 2nd part of the range (if any) starts here. */
+ start = temp_end;
+
+ /* We do a continue, instead of dropping down, because even if
+ * the 2nd part is non-empty, it could be so short that we want
+ * to output it as individual characters, as tested for at the
+ * top of this loop. */
+ continue;
+ }
+
+ /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
+ * output a sub-range of just the digits or letters, then process
+ * the remaining portion as usual. */
+ if (isALPHANUMERIC_A(start)) {
+ UV mask = (isDIGIT_A(start))
+ ? CC_DIGIT_
+ : isUPPER_A(start)
+ ? CC_UPPER_
+ : CC_LOWER_;
+ UV temp_end = start + 1;
+
+ /* Find the end of the sub-range that includes just the
+ * characters in the same class as the first character in it */
+ while (temp_end <= end && generic_isCC_A_(temp_end, mask)) {
+ temp_end++;
+ }
+ temp_end--;
+
+ /* For short ranges, don't duplicate the code above to output
+ * them; just call recursively */
+ if (temp_end - start < min_range_count) {
+ put_range(sv, start, temp_end, FALSE);
+ }
+ else { /* Output as a range */
+ put_code_point(sv, start);
+ sv_catpvs(sv, "-");
+ put_code_point(sv, temp_end);
+ }
+ start = temp_end + 1;
+ continue;
+ }
+
+ /* We output any other printables as individual characters */
+ if (isPUNCT_A(start) || isSPACE_A(start)) {
+ while (start <= end && (isPUNCT_A(start)
+ || isSPACE_A(start)))
+ {
+ put_code_point(sv, start);
+ start++;
+ }
+ continue;
+ }
+ } /* End of looking for literals */
+
+ /* Here is not to output as a literal. Some control characters have
+ * mnemonic names. Split off any of those at the beginning and end of
+ * the range to print mnemonically. It isn't possible for many of
+ * these to be in a row, so this won't overwhelm with output */
+ if ( start <= end
+ && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
+ {
+ while (isMNEMONIC_CNTRL(start) && start <= end) {
+ put_code_point(sv, start);
+ start++;
+ }
+
+ /* If this didn't take care of the whole range ... */
+ if (start <= end) {
+
+ /* Look backwards from the end to find the final non-mnemonic
+ * */
+ UV temp_end = end;
+ while (isMNEMONIC_CNTRL(temp_end)) {
+ temp_end--;
+ }
+
+ /* And separately output the interior range that doesn't start
+ * or end with mnemonics */
+ put_range(sv, start, temp_end, FALSE);
+
+ /* Then output the mnemonic trailing controls */
+ start = temp_end + 1;
+ while (start <= end) {
+ put_code_point(sv, start);
+ start++;
+ }
+ break;
+ }
+ }
+
+ /* As a final resort, output the range or subrange as hex. */
+
+ if (start >= NUM_ANYOF_CODE_POINTS) {
+ this_end = end;
+ }
+ else { /* Have to split range at the bitmap boundary */
+ this_end = (end < NUM_ANYOF_CODE_POINTS)
+ ? end
+ : NUM_ANYOF_CODE_POINTS - 1;
+ }
+#if NUM_ANYOF_CODE_POINTS > 256
+ format = (this_end < 256)
+ ? "\\x%02" UVXf "-\\x%02" UVXf
+ : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
+#else
+ format = "\\x%02" UVXf "-\\x%02" UVXf;
+#endif
+ GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
+ Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
+ GCC_DIAG_RESTORE_STMT;
+ break;
+ }
+}
+
+STATIC void
+S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
+{
+ /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
+ * 'invlist' */
+
+ UV start, end;
+ bool allow_literals = TRUE;
+
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
+
+ /* Generally, it is more readable if printable characters are output as
+ * literals, but if a range (nearly) spans all of them, it's best to output
+ * it as a single range. This code will use a single range if all but 2
+ * ASCII printables are in it */
+ invlist_iterinit(invlist);
+ while (invlist_iternext(invlist, &start, &end)) {
+
+ /* If the range starts beyond the final printable, it doesn't have any
+ * in it */
+ if (start > MAX_PRINT_A) {
+ break;
+ }
+
+ /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
+ * all but two, the range must start and end no later than 2 from
+ * either end */
+ if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
+ if (end > MAX_PRINT_A) {
+ end = MAX_PRINT_A;
+ }
+ if (start < ' ') {
+ start = ' ';
+ }
+ if (end - start >= MAX_PRINT_A - ' ' - 2) {
+ allow_literals = FALSE;
+ }
+ break;
+ }
+ }
+ invlist_iterfinish(invlist);
+
+ /* Here we have figured things out. Output each range */
+ invlist_iterinit(invlist);
+ while (invlist_iternext(invlist, &start, &end)) {
+ if (start >= NUM_ANYOF_CODE_POINTS) {
+ break;
+ }
+ put_range(sv, start, end, allow_literals);
+ }
+ invlist_iterfinish(invlist);
+
+ return;
+}
+
+STATIC SV*
+S_put_charclass_bitmap_innards_common(pTHX_
+ SV* invlist, /* The bitmap */
+ SV* posixes, /* Under /l, things like [:word:], \S */
+ SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
+ SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
+ SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
+ const bool invert /* Is the result to be inverted? */
+)
+{
+ /* Create and return an SV containing a displayable version of the bitmap
+ * and associated information determined by the input parameters. If the
+ * output would have been only the inversion indicator '^', NULL is instead
+ * returned. */
+
+ SV * output;
+
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
+
+ if (invert) {
+ output = newSVpvs("^");
+ }
+ else {
+ output = newSVpvs("");
+ }
+
+ /* First, the code points in the bitmap that are unconditionally there */
+ put_charclass_bitmap_innards_invlist(output, invlist);
+
+ /* Traditionally, these have been placed after the main code points */
+ if (posixes) {
+ sv_catsv(output, posixes);
+ }
+
+ if (only_utf8 && _invlist_len(only_utf8)) {
+ Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
+ put_charclass_bitmap_innards_invlist(output, only_utf8);
+ }
+
+ if (not_utf8 && _invlist_len(not_utf8)) {
+ Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
+ put_charclass_bitmap_innards_invlist(output, not_utf8);
+ }
+
+ if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
+ Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
+ put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
+
+ /* This is the only list in this routine that can legally contain code
+ * points outside the bitmap range. The call just above to
+ * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
+ * output them here. There's about a half-dozen possible, and none in
+ * contiguous ranges longer than 2 */
+ if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
+ UV start, end;
+ SV* above_bitmap = NULL;
+
+ _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
+
+ invlist_iterinit(above_bitmap);
+ while (invlist_iternext(above_bitmap, &start, &end)) {
+ UV i;
+
+ for (i = start; i <= end; i++) {
+ put_code_point(output, i);
+ }
+ }
+ invlist_iterfinish(above_bitmap);
+ SvREFCNT_dec_NN(above_bitmap);
+ }
+ }
+
+ if (invert && SvCUR(output) == 1) {
+ return NULL;
+ }
+
+ return output;
+}
+
+STATIC U8
+S_put_charclass_bitmap_innards(pTHX_ SV *sv,
+ char *bitmap,
+ SV *nonbitmap_invlist,
+ SV *only_utf8_locale_invlist,
+ const regnode * const node,
+ const U8 flags,
+ const bool force_as_is_display)
+{
+ /* Appends to 'sv' a displayable version of the innards of the bracketed
+ * character class defined by the other arguments:
+ * 'bitmap' points to the bitmap, or NULL if to ignore that.
+ * 'nonbitmap_invlist' is an inversion list of the code points that are in
+ * the bitmap range, but for some reason aren't in the bitmap; NULL if
+ * none. The reasons for this could be that they require some
+ * condition such as the target string being or not being in UTF-8
+ * (under /d), or because they came from a user-defined property that
+ * was not resolved at the time of the regex compilation (under /u)
+ * 'only_utf8_locale_invlist' is an inversion list of the code points that
+ * are valid only if the runtime locale is a UTF-8 one; NULL if none
+ * 'node' is the regex pattern ANYOF node. It is needed only when the
+ * above two parameters are not null, and is passed so that this
+ * routine can tease apart the various reasons for them.
+ * 'flags' is the flags field of 'node'
+ * 'force_as_is_display' is TRUE if this routine should definitely NOT try
+ * to invert things to see if that leads to a cleaner display. If
+ * FALSE, this routine is free to use its judgment about doing this.
+ *
+ * It returns 0 if nothing was actually output. (It may be that
+ * the bitmap, etc is empty.)
+ * 1 if the output wasn't inverted (didn't begin with a '^')
+ * 2 if the output was inverted (did begin with a '^')
+ *
+ * When called for outputting the bitmap of a non-ANYOF node, just pass the
+ * bitmap, with the succeeding parameters set to NULL, and the final one to
+ * FALSE.
+ */
+
+ /* In general, it tries to display the 'cleanest' representation of the
+ * innards, choosing whether to display them inverted or not, regardless of
+ * whether the class itself is to be inverted. However, there are some
+ * cases where it can't try inverting, as what actually matches isn't known
+ * until runtime, and hence the inversion isn't either. */
+
+ bool inverting_allowed = ! force_as_is_display;
+
+ int i;
+ STRLEN orig_sv_cur = SvCUR(sv);
+
+ SV* invlist; /* Inversion list we accumulate of code points that
+ are unconditionally matched */
+ SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
+ UTF-8 */
+ SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
+ */
+ SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
+ SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
+ is UTF-8 */
+
+ SV* as_is_display; /* The output string when we take the inputs
+ literally */
+ SV* inverted_display; /* The output string when we invert the inputs */
+
+ bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
+ to match? */
+ /* We are biased in favor of displaying things without them being inverted,
+ * as that is generally easier to understand */
+ const int bias = 5;
+
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
+
+ /* Start off with whatever code points are passed in. (We clone, so we
+ * don't change the caller's list) */
+ if (nonbitmap_invlist) {
+ assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
+ invlist = invlist_clone(nonbitmap_invlist, NULL);
+ }
+ else { /* Worst case size is every other code point is matched */
+ invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
+ }
+
+ if (flags) {
+ if (OP(node) == ANYOFD) {
+
+ /* This flag indicates that the code points below 0x100 in the
+ * nonbitmap list are precisely the ones that match only when the
+ * target is UTF-8 (they should all be non-ASCII). */
+ if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) {
+ _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
+ _invlist_subtract(invlist, only_utf8, &invlist);
+ }
+
+ /* And this flag for matching all non-ASCII 0xFF and below */
+ if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) {
+ not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
+ }
+ }
+ else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
+
+ /* If either of these flags are set, what matches isn't
+ * determinable except during execution, so don't know enough here
+ * to invert */
+ if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
+ inverting_allowed = FALSE;
+ }
+
+ /* What the posix classes match also varies at runtime, so these
+ * will be output symbolically. */
+ if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
+ int i;
+
+ posixes = newSVpvs("");
+ for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
+ if (ANYOF_POSIXL_TEST(node, i)) {
+ sv_catpv(posixes, anyofs[i]);
+ }
+ }
+ }
+ }
+ }
+
+ /* Accumulate the bit map into the unconditional match list */
+ if (bitmap) {
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+ if (BITMAP_TEST(bitmap, i)) {
+ int start = i++;
+ for (;
+ i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
+ i++)
+ { /* empty */ }
+ invlist = _add_range_to_invlist(invlist, start, i-1);
+ }
+ }
+ }
+
+ /* Make sure that the conditional match lists don't have anything in them
+ * that match unconditionally; otherwise the output is quite confusing.
+ * This could happen if the code that populates these misses some
+ * duplication. */
+ if (only_utf8) {
+ _invlist_subtract(only_utf8, invlist, &only_utf8);
+ }
+ if (not_utf8) {
+ _invlist_subtract(not_utf8, invlist, &not_utf8);
+ }
+
+ if (only_utf8_locale_invlist) {
+
+ /* Since this list is passed in, we have to make a copy before
+ * modifying it */
+ only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
+
+ _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
+
+ /* And, it can get really weird for us to try outputting an inverted
+ * form of this list when it has things above the bitmap, so don't even
+ * try */
+ if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
+ inverting_allowed = FALSE;
+ }
+ }
+
+ /* Calculate what the output would be if we take the input as-is */
+ as_is_display = put_charclass_bitmap_innards_common(invlist,
+ posixes,
+ only_utf8,
+ not_utf8,
+ only_utf8_locale,
+ invert);
+
+ /* If have to take the output as-is, just do that */
+ if (! inverting_allowed) {
+ if (as_is_display) {
+ sv_catsv(sv, as_is_display);
+ SvREFCNT_dec_NN(as_is_display);
+ }
+ }
+ else { /* But otherwise, create the output again on the inverted input, and
+ use whichever version is shorter */
+
+ int inverted_bias, as_is_bias;
+
+ /* We will apply our bias to whichever of the results doesn't have
+ * the '^' */
+ bool trial_invert;
+ if (invert) {
+ trial_invert = FALSE;
+ as_is_bias = bias;
+ inverted_bias = 0;
+ }
+ else {
+ trial_invert = TRUE;
+ as_is_bias = 0;
+ inverted_bias = bias;
+ }
+
+ /* Now invert each of the lists that contribute to the output,
+ * excluding from the result things outside the possible range */
+
+ /* For the unconditional inversion list, we have to add in all the
+ * conditional code points, so that when inverted, they will be gone
+ * from it */
+ _invlist_union(only_utf8, invlist, &invlist);
+ _invlist_union(not_utf8, invlist, &invlist);
+ _invlist_union(only_utf8_locale, invlist, &invlist);
+ _invlist_invert(invlist);
+ _invlist_intersection(invlist, PL_InBitmap, &invlist);
+
+ if (only_utf8) {
+ _invlist_invert(only_utf8);
+ _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
+ }
+ else if (not_utf8) {
+
+ /* If a code point matches iff the target string is not in UTF-8,
+ * then complementing the result has it not match iff not in UTF-8,
+ * which is the same thing as matching iff it is UTF-8. */
+ only_utf8 = not_utf8;
+ not_utf8 = NULL;
+ }
+
+ if (only_utf8_locale) {
+ _invlist_invert(only_utf8_locale);
+ _invlist_intersection(only_utf8_locale,
+ PL_InBitmap,
+ &only_utf8_locale);
+ }
+
+ inverted_display = put_charclass_bitmap_innards_common(
+ invlist,
+ posixes,
+ only_utf8,
+ not_utf8,
+ only_utf8_locale, trial_invert);
+
+ /* Use the shortest representation, taking into account our bias
+ * against showing it inverted */
+ if ( inverted_display
+ && ( ! as_is_display
+ || ( SvCUR(inverted_display) + inverted_bias
+ < SvCUR(as_is_display) + as_is_bias)))
+ {
+ sv_catsv(sv, inverted_display);
+ invert = ! invert;
+ }
+ else if (as_is_display) {
+ sv_catsv(sv, as_is_display);
+ }
+
+ SvREFCNT_dec(as_is_display);
+ SvREFCNT_dec(inverted_display);
+ }
+
+ SvREFCNT_dec_NN(invlist);
+ SvREFCNT_dec(only_utf8);
+ SvREFCNT_dec(not_utf8);
+ SvREFCNT_dec(posixes);
+ SvREFCNT_dec(only_utf8_locale);
+
+ U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur);
+ if (did_output_something) {
+ /* Distinguish between non and inverted cases */
+ did_output_something += invert;
+ }
+
+ return did_output_something;
+}
+
+
+const regnode *
+Perl_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
+ const regnode *last, const regnode *plast,
+ SV* sv, I32 indent, U32 depth)
+{
+ const regnode *next;
+ const regnode *optstart= NULL;
+
+ RXi_GET_DECL(r, ri);
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
+
+ PERL_ARGS_ASSERT_DUMPUNTIL;
+
+#ifdef DEBUG_DUMPUNTIL
+ Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
+ last ? last-start : 0, plast ? plast-start : 0);
+#endif
+
+ if (plast && plast < last)
+ last= plast;
+
+ while (node && (!last || node < last)) {
+ const U8 op = OP(node);
+
+ if (op == CLOSE || op == SRCLOSE || op == WHILEM)
+ indent--;
+ next = regnext((regnode *)node);
+ const regnode *after = regnode_after((regnode *)node,0);
+
+ /* Where, what. */
+ if (op == OPTIMIZED) {
+ if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
+ optstart = node;
+ else
+ goto after_print;
+ } else
+ CLEAR_OPTSTART;
+
+ regprop(r, sv, node, NULL, NULL);
+ Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
+ (int)(2*indent + 1), "", SvPVX_const(sv));
+
+ if (op != OPTIMIZED) {
+ if (next == NULL) /* Next ptr. */
+ Perl_re_printf( aTHX_ " (0)");
+ else if (REGNODE_TYPE(op) == BRANCH
+ && REGNODE_TYPE(OP(next)) != BRANCH )
+ Perl_re_printf( aTHX_ " (FAIL)");
+ else
+ Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
+ Perl_re_printf( aTHX_ "\n");
+ }
+
+ after_print:
+ if (REGNODE_TYPE(op) == BRANCHJ) {
+ assert(next);
+ const regnode *nnode = (OP(next) == LONGJMP
+ ? regnext((regnode *)next)
+ : next);
+ if (last && nnode > last)
+ nnode = last;
+ DUMPUNTIL(after, nnode);
+ }
+ else if (REGNODE_TYPE(op) == BRANCH) {
+ assert(next);
+ DUMPUNTIL(after, next);
+ }
+ else if ( REGNODE_TYPE(op) == TRIE ) {
+ const regnode *this_trie = node;
+ const U32 n = ARG(node);
+ const reg_ac_data * const ac = op>=AHOCORASICK ?
+ (reg_ac_data *)ri->data->data[n] :
+ NULL;
+ const reg_trie_data * const trie =
+ (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
+#ifdef DEBUGGING
+ AV *const trie_words
+ = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
+#endif
+ const regnode *nextbranch= NULL;
+ I32 word_idx;
+ SvPVCLEAR(sv);
+ for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
+ SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0);
+
+ Perl_re_indentf( aTHX_ "%s ",
+ indent+3,
+ elem_ptr
+ ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
+ SvCUR(*elem_ptr), PL_dump_re_max_len,
+ PL_colors[0], PL_colors[1],
+ (SvUTF8(*elem_ptr)
+ ? PERL_PV_ESCAPE_UNI
+ : 0)
+ | PERL_PV_PRETTY_ELLIPSES
+ | PERL_PV_PRETTY_LTGT
+ )
+ : "???"
+ );
+ if (trie->jump) {
+ U16 dist= trie->jump[word_idx+1];
+ Perl_re_printf( aTHX_ "(%" UVuf ")\n",
+ (UV)((dist ? this_trie + dist : next) - start));
+ if (dist) {
+ if (!nextbranch)
+ nextbranch= this_trie + trie->jump[0];
+ DUMPUNTIL(this_trie + dist, nextbranch);
+ }
+ if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
+ nextbranch= regnext((regnode *)nextbranch);
+ } else {
+ Perl_re_printf( aTHX_ "\n");
+ }
+ }
+ if (last && next > last)
+ node= last;
+ else
+ node= next;
+ }
+ else if ( op == CURLY ) { /* "next" might be very big: optimizer */
+ DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */
+ }
+ else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) {
+ assert(next);
+ DUMPUNTIL(after, next);
+ }
+ else if ( op == PLUS || op == STAR) {
+ DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */
+ }
+ else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) {
+ /* Literal string, where present. */
+ node = (const regnode *)REGNODE_AFTER_varies(node);
+ }
+ else {
+ node = REGNODE_AFTER_opcode(node,op);
+ }
+ if (op == CURLYX || op == OPEN || op == SROPEN)
+ indent++;
+ if (REGNODE_TYPE(op) == END)
+ break;
+ }
+ CLEAR_OPTSTART;
+#ifdef DEBUG_DUMPUNTIL
+ Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
+#endif
+ return node;
+}
+
+#endif /* DEBUGGING */