diff options
Diffstat (limited to 'regcomp.c')
-rw-r--r-- | regcomp.c | 215 |
1 files changed, 194 insertions, 21 deletions
@@ -115,9 +115,11 @@ typedef struct RExC_state_t { U16 flags16; /* are we folding, multilining? */ char *precomp; /* uncompiled string. */ regexp *rx; + char *start; /* Start of input for compile */ char *end; /* End of input for compile */ char *parse; /* Input-scan pointer. */ I32 whilem_seen; /* number of WHILEM in this expr */ + regnode *emit_start; /* Start of emitted-code area */ regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ @@ -137,10 +139,13 @@ typedef struct RExC_state_t { #define RExC_flags16 (pRExC_state->flags16) #define RExC_precomp (pRExC_state->precomp) #define RExC_rx (pRExC_state->rx) +#define RExC_start (pRExC_state->start) #define RExC_end (pRExC_state->end) #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) +#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */ #define RExC_emit (pRExC_state->emit) +#define RExC_emit_start (pRExC_state->emit_start) #define RExC_naughty (pRExC_state->naughty) #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) @@ -417,6 +422,50 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Allow for side effects in s */ #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END +/* Macros for recording node offsets. 20001227 mjd@plover.com + * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in + * element 2*n-1 of the array. Element #2n holds the byte length node #n. + * Element 0 holds the number n. + */ + +#define MJD_OFFSET_DEBUG(x) +/* #define MJD_OFFSET_DEBUG(x) fprintf x */ + + +# define Set_Node_Offset_To_R(node,byte) \ + STMT_START { \ + if (! SIZE_ONLY) { \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \ + } else { \ + RExC_offsets[2*(node)-1] = (byte); \ + } \ + } \ + } STMT_END + +# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start) +# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) + +# define Set_Node_Length_To_R(node,len) \ + STMT_START { \ + if (! SIZE_ONLY) { \ + MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \ + if((node) < 0) { \ + Perl_croak(aTHX_ "value of node is %d in Length macro", node); \ + } else { \ + RExC_offsets[2*(node)] = (len); \ + } \ + } \ + } STMT_END + +# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len) +# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len) +# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start) + +/* Get offsets and lengths */ +#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) +#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) + static void clear_re(pTHXo_ void *r); /* Mark that we cannot extend a found fixed substring at this point. @@ -1614,6 +1663,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* First pass: determine size, legality. */ RExC_parse = exp; + RExC_start = exp; RExC_end = xend; RExC_naughty = 0; RExC_npar = 1; @@ -1660,6 +1710,15 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->startp = 0; /* Useful during FAIL. */ r->endp = 0; /* Useful during FAIL. */ + Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ + if (r->offsets) { + r->offsets[0] = RExC_size; + } + DEBUG_r(PerlIO_printf(Perl_debug_log, + "%s %u bytes for offset annotations.\n", + r->offsets ? "Got" : "Couldn't get", + (2*RExC_size+1) * sizeof(U32))); + RExC_rx = r; /* Second pass: emit code. */ @@ -1667,6 +1726,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) RExC_end = xend; RExC_naughty = 0; RExC_npar = 1; + RExC_emit_start = r->program; RExC_emit = r->program; /* Store the count of eval-groups for security checks: */ RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals); @@ -1957,6 +2017,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) register regnode *ender = 0; register I32 parno = 0; I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0; + char * parse_start = RExC_parse; /* MJD */ char *oregcomp_parse = RExC_parse; char c; @@ -1964,7 +2025,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) /* Make an OPEN node, if parenthesized. */ if (paren) { - if (*RExC_parse == '?') { + if (*RExC_parse == '?') { /* (?...) */ U16 posflags = 0, negflags = 0; U16 *flagsp = &posflags; int logical = 0; @@ -1974,24 +2035,24 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) paren = *RExC_parse++; ret = NULL; /* For look-ahead/behind. */ switch (paren) { - case '<': + case '<': /* (?<...) */ RExC_seen |= REG_SEEN_LOOKBEHIND; if (*RExC_parse == '!') paren = ','; if (*RExC_parse != '=' && *RExC_parse != '!') goto unknown; RExC_parse++; - case '=': - case '!': + case '=': /* (?=...) */ + case '!': /* (?!...) */ RExC_seen_zerolen++; - case ':': - case '>': + case ':': /* (?:...) */ + case '>': /* (?>...) */ break; - case '$': - case '@': + case '$': /* (?$...) */ + case '@': /* (?@...) */ vFAIL2("Sequence (?%c...) not implemented", (int)paren); break; - case '#': + case '#': /* (?#...) */ while (*RExC_parse && *RExC_parse != ')') RExC_parse++; if (*RExC_parse != ')') @@ -1999,15 +2060,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) nextchar(pRExC_state); *flagp = TRYAGAIN; return NULL; - case 'p': + case 'p': /* (?p...) */ if (SIZE_ONLY) vWARN(RExC_parse, "(?p{}) is deprecated - use (??{})"); /* FALL THROUGH*/ - case '?': + case '?': /* (??...) */ logical = 1; paren = *RExC_parse++; /* FALL THROUGH */ - case '{': + case '{': /* (?{...}) */ { I32 count = 1, n = 0; char c; @@ -2066,13 +2127,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) if (!SIZE_ONLY) ret->flags = 2; regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n)); + /* deal with the length of this later - MJD */ return ret; } return reganode(pRExC_state, EVAL, n); } - case '(': + case '(': /* (?(?{...})...) and (?(?=...)...) */ { - if (RExC_parse[0] == '?') { + if (RExC_parse[0] == '?') { /* (?(?...)) */ if (RExC_parse[1] == '=' || RExC_parse[1] == '!' || RExC_parse[1] == '<' || RExC_parse[1] == '{') { /* Lookahead or eval. */ @@ -2086,11 +2148,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) } } else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + /* (?(1)...) */ parno = atoi(RExC_parse++); while (isDIGIT(*RExC_parse)) RExC_parse++; - ret = reganode(pRExC_state, GROUPP, parno); + ret = reganode(pRExC_state, GROUPP, parno); + if ((c = *nextchar(pRExC_state)) != ')') vFAIL("Switch condition not recognized"); insert_if: @@ -2135,7 +2199,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) break; default: --RExC_parse; - parse_flags: + parse_flags: /* (?i) */ while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) { if (*RExC_parse != 'o') pmflag(flagsp, *RExC_parse); @@ -2163,26 +2227,34 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) return NULL; } } - else { + else { /* (...) */ parno = RExC_npar; RExC_npar++; ret = reganode(pRExC_state, OPEN, parno); + Set_Node_Length(ret, 1); /* MJD */ + Set_Node_Offset(ret, RExC_parse); /* MJD */ open = 1; } } - else + else /* ! paren */ ret = NULL; /* Pick up the branches, linking them together. */ + parse_start = RExC_parse; /* MJD */ br = regbranch(pRExC_state, &flags, 1); + /* branch_len = (paren != 0); */ + if (br == NULL) return(NULL); if (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { reginsert(pRExC_state, BRANCHJ, br); } - else + else { /* MJD */ reginsert(pRExC_state, BRANCH, br); + Set_Node_Length(br, paren != 0); + Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start); + } have_branch = 1; if (SIZE_ONLY) RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ @@ -2208,6 +2280,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) RExC_extralen += 2; /* Account for LONGJMP. */ nextchar(pRExC_state); br = regbranch(pRExC_state, &flags, 0); + if (br == NULL) return(NULL); regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ @@ -2225,6 +2298,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) break; case 1: ender = reganode(pRExC_state, CLOSE, parno); + Set_Node_Offset(ender,RExC_parse+1); /* MJD */ + Set_Node_Length(ender,1); /* MJD */ break; case '<': case ',': @@ -2304,8 +2379,10 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first) else { if (!SIZE_ONLY && RExC_extralen) ret = reganode(pRExC_state, BRANCHJ,0); - else + else { ret = reg_node(pRExC_state, BRANCH); + Set_Node_Length(ret, 1); + } } if (!first && SIZE_ONLY) @@ -2367,6 +2444,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) char *maxpos; I32 min; I32 max = REG_INFTY; + char *parse_start; ret = regatom(pRExC_state, &flags); if (ret == NULL) { @@ -2378,6 +2456,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) op = *RExC_parse; if (op == '{' && regcurly(RExC_parse)) { + parse_start = RExC_parse; /* MJD */ next = RExC_parse + 1; maxpos = Nullch; while (isDIGIT(*next) || *next == ',') { @@ -2410,6 +2489,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) if ((flags&SIMPLE)) { RExC_naughty += 2 + RExC_naughty / 2; reginsert(pRExC_state, CURLY, ret); + Set_Node_Offset(ret, parse_start+1); /* MJD */ + Set_Node_Cur_Length(ret); } else { regnode *w = reg_node(pRExC_state, WHILEM); @@ -2422,6 +2503,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ } reginsert(pRExC_state, CURLYX,ret); + /* MJD hk */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Length(ret, + op == '{' ? (RExC_parse - parse_start) : 1); + if (!SIZE_ONLY && RExC_extralen) NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); @@ -2467,6 +2553,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) vFAIL("Regexp *+ operand could be empty"); #endif + parse_start = RExC_parse; nextchar(pRExC_state); *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); @@ -2528,6 +2615,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) { register regnode *ret = 0; I32 flags; + char *parse_start = RExC_parse; *flagp = WORST; /* Tentatively. */ @@ -2542,6 +2630,7 @@ tryagain: ret = reg_node(pRExC_state, SBOL); else ret = reg_node(pRExC_state, BOL); + Set_Node_Length(ret, 1); /* MJD */ break; case '$': nextchar(pRExC_state); @@ -2553,6 +2642,7 @@ tryagain: ret = reg_node(pRExC_state, SEOL); else ret = reg_node(pRExC_state, EOL); + Set_Node_Length(ret, 1); /* MJD */ break; case '.': nextchar(pRExC_state); @@ -2562,6 +2652,7 @@ tryagain: ret = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; RExC_naughty++; + Set_Node_Length(ret, 1); /* MJD */ break; case '[': { @@ -2573,6 +2664,7 @@ tryagain: } nextchar(pRExC_state); *flagp |= HASWIDTH|SIMPLE; + Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ break; } case '(': @@ -2619,12 +2711,14 @@ tryagain: ret = reg_node(pRExC_state, SBOL); *flagp |= SIMPLE; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'G': ret = reg_node(pRExC_state, GPOS); RExC_seen |= REG_SEEN_GPOS; *flagp |= SIMPLE; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'Z': ret = reg_node(pRExC_state, SEOL); @@ -2636,27 +2730,32 @@ tryagain: *flagp |= SIMPLE; RExC_seen_zerolen++; /* Do not optimize RE away */ nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'C': ret = reg_node(pRExC_state, SANY); RExC_seen |= REG_SEEN_SANY; *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'X': ret = reg_node(pRExC_state, CLUMP); *flagp |= HASWIDTH; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'w': ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'W': ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'b': RExC_seen_zerolen++; @@ -2664,6 +2763,7 @@ tryagain: ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND); *flagp |= SIMPLE; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'B': RExC_seen_zerolen++; @@ -2671,31 +2771,37 @@ tryagain: ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND); *flagp |= SIMPLE; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 's': ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'S': ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'd': ret = reg_node(pRExC_state, DIGIT); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'D': ret = reg_node(pRExC_state, NDIGIT); *flagp |= HASWIDTH|SIMPLE; nextchar(pRExC_state); + Set_Node_Length(ret, 2); /* MJD */ break; case 'p': case 'P': { /* a lovely hack--pretend we saw [\pX] instead */ char* oldregxend = RExC_end; + char* parse_start = RExC_parse; if (RExC_parse[1] == '{') { RExC_end = strchr(RExC_parse, '}'); @@ -2714,6 +2820,7 @@ tryagain: RExC_end = oldregxend; RExC_parse--; + Set_Node_Cur_Length(ret); /* MJD */ nextchar(pRExC_state); *flagp |= HASWIDTH|SIMPLE; } @@ -2736,6 +2843,7 @@ tryagain: if (num > 9 && num >= RExC_npar) goto defchar; else { + char * parse_start = RExC_parse - 1; /* MJD */ while (isDIGIT(*RExC_parse)) RExC_parse++; @@ -2746,6 +2854,10 @@ tryagain: ? (LOC ? REFFL : REFF) : REF, num); *flagp |= HASWIDTH; + + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret); /* MJD */ RExC_parse--; nextchar(pRExC_state); } @@ -2776,6 +2888,7 @@ tryagain: register char *p; char *oldp, *s; STRLEN numlen; + char *parse_start = RExC_parse - 1; RExC_parse++; @@ -2941,6 +3054,7 @@ tryagain: } loopdone: RExC_parse = p - 1; + Set_Node_Cur_Length(ret); /* MJD */ nextchar(pRExC_state); { /* len is STRLEN which is unsigned, need to copy to signed */ @@ -3154,6 +3268,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) bool need_class = 0; SV *listsv; register char *e; + char *parse_start = RExC_parse; /* MJD */ UV n; bool dont_optimize_invert = FALSE; @@ -3867,6 +3982,18 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE(ptr, op); + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", + "reg_node", __LINE__, + reg_name[op], + RExC_emit - RExC_emit_start > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + RExC_emit - RExC_emit_start, + RExC_parse - RExC_start, + RExC_offsets[0])); + Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); + } + RExC_emit = ptr; return(ret); @@ -3891,6 +4018,17 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE_ARG(ptr, op, arg); + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", + "reganode", + RExC_emit - RExC_emit_start > RExC_offsets[0] ? + "Overwriting end of array!\n" : "OK", + RExC_emit - RExC_emit_start, + RExC_parse - RExC_start, + RExC_offsets[0])); + Set_Cur_Node_Offset; + } + RExC_emit = ptr; return(ret); @@ -3928,10 +4066,33 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd) src = RExC_emit; RExC_emit += NODE_STEP_REGNODE + offset; dst = RExC_emit; - while (src > opnd) + while (src > opnd) { StructCopy(--src, --dst, regnode); + if (RExC_offsets) { /* MJD 20010112 */ + MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n", + "reg_insert", + dst - RExC_emit_start > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + src - RExC_emit_start, + dst - RExC_emit_start, + RExC_offsets[0])); + Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); + Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); + } + } + place = opnd; /* Op node, where operand used to be. */ + if (RExC_offsets) { /* MJD */ + MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", + "reginsert", + place - RExC_emit_start > RExC_offsets[0] + ? "Overwriting end of array!\n" : "OK", + place - RExC_emit_start, + RExC_parse - RExC_start, + RExC_offsets[0])); + Set_Node_Offset(place, RExC_parse); + } src = NEXTOPER(place); FILL_ADVANCE_NODE(place, op); Zero(src, offset, regnode); @@ -4145,6 +4306,16 @@ Perl_regdump(pTHX_ regexp *r) if (r->reganch & ROPT_EVAL_SEEN) PerlIO_printf(Perl_debug_log, "with eval "); PerlIO_printf(Perl_debug_log, "\n"); + if (r->offsets) { + U32 i; + U32 len = r->offsets[0]; + PerlIO_printf(Perl_debug_log, "Offsets: [%u]\n\t", r->offsets[0]); + for (i = 1; i <= len; i++) + PerlIO_printf(Perl_debug_log, "%u[%u] ", + r->offsets[i*2-1], + r->offsets[i*2]); + PerlIO_printf(Perl_debug_log, "\n"); + } #endif /* DEBUGGING */ } @@ -4366,6 +4537,8 @@ Perl_pregfree(pTHX_ struct regexp *r) if (r->precomp) Safefree(r->precomp); + if (r->offsets) /* 20010421 MJD */ + Safefree(r->offsets); if (RX_MATCH_COPIED(r)) Safefree(r->subbeg); if (r->substrs) { |