diff options
author | Karl Williamson <khw@cpan.org> | 2015-03-16 22:38:20 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-03-18 16:14:37 -0600 |
commit | b6d67071cc036ae5056dfe9b570ba76942fc08f4 (patch) | |
tree | fa13bf0bff4cbc76df8df602b33174470e3ab31a /regcomp.c | |
parent | 7a4ca5b4c6cbf0022494a8f350fe000abb4b3034 (diff) | |
download | perl-b6d67071cc036ae5056dfe9b570ba76942fc08f4.tar.gz |
Fix qr'\N{U+41}' on EBCDIC platforms
Prior to this commit, the regex compiler was relying on the lexer to do
the translation from Unicode to native for \N{...} constructs, where it
was simpler to do. However, when the pattern is a single-quoted string,
it is passed unchanged to the regex compiler, bypassing the lexer, so the
translation was never done and the construct did not work. Fixing
it required some refactoring, though it led to a clean API in a static
function.
This was spotted by Father Chrysostomos.
Diffstat (limited to 'regcomp.c')
-rw-r--r-- | regcomp.c | 459 |
1 files changed, 263 insertions, 196 deletions
@@ -180,6 +180,9 @@ struct RExC_state_t { I32 contains_locale; I32 contains_i; I32 override_recoding; +#ifdef EBCDIC + I32 recode_x_to_native; +#endif I32 in_multi_char_class; struct reg_code_block *code_blocks; /* positions of literal (?{}) within pattern */ @@ -255,6 +258,9 @@ struct RExC_state_t { #define RExC_contains_locale (pRExC_state->contains_locale) #define RExC_contains_i (pRExC_state->contains_i) #define RExC_override_recoding (pRExC_state->override_recoding) +#ifdef EBCDIC +# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) +#endif #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) #define RExC_frame_head (pRExC_state->frame_head) #define RExC_frame_last (pRExC_state->frame_last) @@ -6629,6 +6635,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; RExC_override_recoding = 0; +#ifdef EBCDIC + RExC_recode_x_to_native = 0; +#endif RExC_in_multi_char_class = 0; /* First pass: determine size, legality. */ @@ -11018,95 +11027,94 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) return(ret); } -STATIC STRLEN -S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, - UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse +STATIC bool +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, + regnode ** node_p, + UV * code_point_p, + int * cp_count, + I32 * flagp, + const U32 depth ) { - - /* This is expected to be called by a parser routine that has recognized '\N' - and needs to handle the rest. RExC_parse is expected to point at the first - char following the N at the time of the call. On successful return, - RExC_parse has been updated to point to just after the sequence identified - by this routine, <*flagp> has been updated, and the non-NULL input pointers - have been set appropriately. - - The typical case for this is \N{some character name}. 
This is usually - called while parsing the input, filling in or ready to fill in an EXACTish - node, and the code point for the character should be returned, so that it - can be added to the node, and parsing continued with the next input - character. But it may be that instead of a single character the \N{} - expands to more than one, a named sequence. In this case any following - quantifier applies to the whole sequence, and it is easier, given the code - structure that calls this, to handle it from a different area of the code. - For this reason, the input parameters can be set so that it returns valid - only on one or the other of these cases. - - Another possibility is for the input to be an empty \N{}, which for - backwards compatibility we accept, but generate a NOTHING node which should - later get optimized out. This is handled from the area of code which can - handle a named sequence, so if called with the parameters for the other, it - fails. - - Still another possibility is for the \N to mean [^\n], and not a single - character or explicit sequence at all. This is determined by context. - Again, this is handled from the area of code which can handle a named - sequence, so if called with the parameters for the other, it also fails. - - And the final possibility is for the \N to be called from within a bracketed - character class. In this case the [^\n] meaning makes no sense, and so is - an error. Other anomalous situations are left to the calling code to handle. - - For non-single-quoted regexes, the tokenizer has attempted to decide which - of the above applies, and in the case of a named sequence, has converted it - into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, - where c1... are the characters in the sequence. For single-quoted regexes, - the tokenizer passes the \N sequence through unchanged; this code will not - attempt to determine this nor expand those, instead raising a syntax error. 
- The net effect is that if the beginning of the passed-in pattern isn't '{U+' - or there is no '}', it signals that this \N occurrence means to match a - non-newline. (This mostly was done because of [perl #56444].) - - The API is somewhat convoluted due to historical and the above reasons. - - The function raises an error (via vFAIL), and doesn't return for various - syntax errors. For other failures, it returns (STRLEN) -1. For successes, - it returns a count of how many characters were accounted for by it. (This - can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code - points in the sequence. It sets <node_p>, <valuep>, and/or - <substitute_parse> on success. - - If <valuep> is non-null, it means the caller can accept an input sequence - consisting of just a single code point; <*valuep> is set to the value of the - only or first code point in the input. - - If <substitute_parse> is non-null, it means the caller can accept an input - sequence consisting of one or more code points; <*substitute_parse> is a - newly created mortal SV* in this case, containing \x{} escapes representing - those code points. - - Both <valuep> and <substitute_parse> can be non-NULL. - - If <node_p> is non-null, <substitute_parse> must be NULL. This signifies - that the caller can accept any legal sequence other than a single code - point. To wit, <*node_p> is set as follows: - 1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1 - 2) \N{}: points to a new NOTHING node; return is 0 - 3) otherwise: points to a new EXACT node containing the resolved - string; return is the number of code points in the - string. This will never be 1. - Note that failure is returned for single code point sequences if <valuep> is - null and <node_p> is not. - */ - - char * endbrace; /* '}' following the name */ - char* p; + /* This routine teases apart the various meanings of \N and returns + * accordingly. 
The input parameters constrain which meaning(s) is/are valid + * in the current context. + * + * Exactly one of <node_p> and <code_point_p> must be non-NULL. + * + * If <code_point_p> is not NULL, the context is expecting the result to be a + * single code point. If this \N instance turns out to a single code point, + * the function returns TRUE and sets *code_point_p to that code point. + * + * If <node_p> is not NULL, the context is expecting the result to be one of + * the things representable by a regnode. If this \N instance turns out to be + * one such, the function generates the regnode, returns TRUE and sets *node_p + * to point to that regnode. + * + * If this instance of \N isn't legal in any context, this function will + * generate a fatal error and not return. + * + * On input, RExC_parse should point to the first char following the \N at the + * time of the call. On successful return, RExC_parse will have been updated + * to point to just after the sequence identified by this routine. Also + * *flagp has been updated as needed. + * + * When there is some problem with the current context and this \N instance, + * the function returns FALSE, without advancing RExC_parse, nor setting + * *node_p, nor *code_point_p, nor *flagp. + * + * If <cp_count> is not NULL, the caller wants to know the length (in code + * points) that this \N sequence matches. This is set even if the function + * returns FALSE, as detailed below. + * + * There are 5 possibilities here, as detailed in the next 5 paragraphs. + * + * Probably the most common case is for the \N to specify a single code point. + * *cp_count will be set to 1, and *code_point_p will be set to that code + * point. + * + * Another possibility is for the input to be an empty \N{}, which for + * backwards compatibility we accept. *cp_count will be set to 0. *node_p + * will be set to a generated NOTHING node. + * + * Still another possibility is for the \N to mean [^\n]. *cp_count will be + * set to 0. 
*node_p will be set to a generated REG_ANY node. + * + * The fourth possibility is that \N resolves to a sequence of more than one + * code points. *cp_count will be set to the number of code points in the + * sequence. *node_p * will be set to a generated node returned by this + * function calling S_reg(). + * + * The final possibility, which happens only when the fourth one would + * otherwise be in effect, is that one of those code points requires the + * pattern to be recompiled as UTF-8. The function returns FALSE, and sets + * the RESTART_UTF8 flag in *flagp. When this happens, the caller needs to + * desist from continuing parsing, and return this information to its caller. + * This is not set for when there is only one code point, as this can be + * called as part of an ANYOF node, and they can store above-Latin1 code + * points without the pattern having to be in UTF-8. + * + * For non-single-quoted regexes, the tokenizer has resolved character and + * sequence names inside \N{...} into their Unicode values, normalizing the + * result into what we should see here: '\N{U+c1.c2...}', where c1... are the + * hex-represented code points in the sequence. This is done there because + * the names can vary based on what charnames pragma is in scope at the time, + * so we need a way to take a snapshot of what they resolve to at the time of + * the original parse. [perl #56444]. + * + * That parsing is skipped for single-quoted regexes, so we may here get + * '\N{NAME}'. This is a fatal error. These names have to be resolved by the + * parser. But if the single-quoted regex is something like '\N{U+41}', that + * is legal and handled here. The code point is Unicode, and has to be + * translated into the native character set for non-ASCII platforms. + * the tokenizer passes the \N sequence through unchanged; this code will not + * attempt to determine this nor expand those, instead raising a syntax error. 
+ */ + + char * endbrace; /* points to '}' following the name */ char *endchar; /* Points to '.' or '}' ending cur char in the input stream */ - bool has_multiple_chars; /* true if the input stream contains a sequence of - more than one character */ - bool in_char_class = substitute_parse != NULL; - STRLEN count = 0; /* Number of characters in this sequence */ + char* p; /* Temporary */ GET_RE_DEBUG_FLAGS_DECL; @@ -11114,11 +11122,15 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, GET_RE_DEBUG_FLAGS; - assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ - assert(! (node_p && substitute_parse)); /* At most 1 should be set */ + assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */ + assert(! (node_p && cp_count)); /* At most 1 should be set */ + + if (cp_count) { /* Initialize return for the most common case */ + *cp_count = 1; + } /* The [^\n] meaning of \N ignores spaces and comments under the /x - * modifier. The other meaning does not, so use a temporary until we find + * modifier. The other meanings do not, so use a temporary until we find * out which we are being called with */ p = (RExC_flags & RXf_PMf_EXTENDED) ? regpatws(pRExC_state, RExC_parse, @@ -11126,15 +11138,16 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, : RExC_parse; /* Disambiguate between \N meaning a named character versus \N meaning - * [^\n]. The former is assumed when it can't be the latter. */ + * [^\n]. The latter is assumed when the {...} following the \N is a legal + * quantifier, or there is no a '{' at all */ if (*p != '{' || regcurly(p)) { RExC_parse = p; + if (cp_count) { + *cp_count = -1; + } + if (! 
node_p) { - /* no bare \N allowed in a charclass */ - if (in_char_class) { - vFAIL("\\N in a character class must be a named character: \\N{...}"); - } - return (STRLEN) -1; + return FALSE; } RExC_parse--; /* Need to back off so nextchar() doesn't skip the current char */ @@ -11143,7 +11156,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, *flagp |= HASWIDTH|SIMPLE; MARK_NAUGHTY(1); Set_Node_Length(*node_p, 1); /* MJD */ - return 1; + return TRUE; } /* Here, we have decided it should be a named character or sequence */ @@ -11171,14 +11184,16 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ if (endbrace == RExC_parse) { /* empty: \N{} */ - if (node_p) { - *node_p = reg_node(pRExC_state,NOTHING); - } - else if (! in_char_class) { - return (STRLEN) -1; + if (cp_count) { + *cp_count = 0; } nextchar(pRExC_state); - return 0; + if (! node_p) { + return FALSE; + } + + *node_p = reg_node(pRExC_state,NOTHING); + return TRUE; } RExC_parse += 2; /* Skip past the 'U+' */ @@ -11187,28 +11202,40 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, /* Code points are separated by dots. If none, there is only one code * point, and is terminated by the brace */ - has_multiple_chars = (endchar < endbrace); - /* We get the first code point if we want it, and either there is only one, - * or we can accept both cases of one and there is more than one */ - if (valuep && (substitute_parse || ! has_multiple_chars)) { - STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); - I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES + if (endchar >= endbrace) { + STRLEN length_of_hex; + I32 grok_hex_flags; + + /* Here, exactly one code point. If that isn't what is wanted, fail */ + if (! 
code_point_p) { + RExC_parse = p; + return FALSE; + } + + /* Convert code point from hex */ + length_of_hex = (STRLEN)(endchar - RExC_parse); + grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES | PERL_SCAN_DISALLOW_PREFIX /* No errors in the first pass (See [perl * #122671].) We let the code below find the * errors when there are multiple chars. */ - | ((SIZE_ONLY || has_multiple_chars) + | ((SIZE_ONLY) ? PERL_SCAN_SILENT_ILLDIGIT : 0); - *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL); + /* This routine is the one place where both single- and double-quotish + * \N{U+xxxx} are evaluated. The value is a Unicode code point which + * must be converted to native. */ + *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse, + &length_of_hex, + &grok_hex_flags, + NULL)); /* The tokenizer should have guaranteed validity, but it's possible to * bypass it by using single quoting, so check. Don't do the check * here when there are multiple chars; we do it below anyway. */ - if (! has_multiple_chars) { if (length_of_hex == 0 || length_of_hex != (STRLEN)(endchar - RExC_parse) ) { @@ -11224,79 +11251,79 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, } RExC_parse = endbrace + 1; - return 1; - } - } - - /* Here, we should have already handled the case where a single character - * is expected and found. So it is a failure if we aren't expecting - * multiple chars and got them; or didn't get them but wanted them. We - * fail without advancing the parse, so that the caller can try again with - * different acceptance criteria */ - if ((! node_p && ! substitute_parse) || ! has_multiple_chars) { - RExC_parse = p; - return (STRLEN) -1; + return TRUE; } - - { - /* What is done here is to convert this to a sub-pattern of the form - * \x{char1}\x{char2}... - * and then either return it in <*substitute_parse> if non-null; or - * call reg recursively to parse it (enclosing in "(?: ... )" ). 
That - * way, it retains its atomicness, while not having to worry about - * special handling that some code points may have. toke.c has - * converted the original Unicode values to native, so that we can just - * pass on the hex values unchanged. We do have to set a flag to keep - * recoding from happening in the recursion */ - - SV * dummy = NULL; + else { /* Is a multiple character sequence */ + SV * substitute_parse; STRLEN len; char *orig_end = RExC_end; I32 flags; - if (substitute_parse) { - *substitute_parse = newSVpvs(""); + /* Count the code points, if desired, in the sequence */ + if (cp_count) { + *cp_count = 0; + while (RExC_parse < endbrace) { + /* Point to the beginning of the next character in the sequence. */ + RExC_parse = endchar + 1; + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + (*cp_count)++; + } } - else { - substitute_parse = &dummy; - *substitute_parse = newSVpvs("?:"); + + /* Fail if caller doesn't want to handle a multi-code-point sequence. + * But don't backup up the pointer if the caller want to know how many + * code points there are (they can then handle things) */ + if (! node_p) { + if (! cp_count) { + RExC_parse = p; + } + return FALSE; } - *substitute_parse = sv_2mortal(*substitute_parse); + + /* What is done here is to convert this to a sub-pattern of the form + * \x{char1}\x{char2}... and then call reg recursively to parse it + * (enclosing in "(?: ... )" ). That way, it retains its atomicness, + * while not having to worry about special handling that some code + * points may have. 
*/ + + substitute_parse = newSVpvs("?:"); while (RExC_parse < endbrace) { /* Convert to notation the rest of the code understands */ - sv_catpv(*substitute_parse, "\\x{"); - sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse); - sv_catpv(*substitute_parse, "}"); + sv_catpv(substitute_parse, "\\x{"); + sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); + sv_catpv(substitute_parse, "}"); /* Point to the beginning of the next character in the sequence. */ RExC_parse = endchar + 1; endchar = RExC_parse + strcspn(RExC_parse, ".}"); - count++; } - if (! in_char_class) { - sv_catpv(*substitute_parse, ")"); - } + sv_catpv(substitute_parse, ")"); - RExC_parse = SvPV(*substitute_parse, len); + RExC_parse = SvPV(substitute_parse, len); /* Don't allow empty number */ - if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) { + if (len < (STRLEN) 8) { RExC_parse = endbrace; vFAIL("Invalid hexadecimal number in \\N{U+...}"); } RExC_end = RExC_parse + len; - /* The values are Unicode, and therefore not subject to recoding */ + /* The values are Unicode, and therefore not subject to recoding, but + * have to be converted to native on a non-Unicode (meaning non-ASCII) + * platform. 
*/ RExC_override_recoding = 1; +#ifdef EBCDIC + RExC_recode_x_to_native = 1; +#endif if (node_p) { if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) { if (flags & RESTART_UTF8) { *flagp = RESTART_UTF8; - return (STRLEN) -1; + return FALSE; } FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", (UV) flags); @@ -11304,14 +11331,19 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); } + /* Restore the saved values */ RExC_parse = endbrace; RExC_end = orig_end; RExC_override_recoding = 0; +#ifdef EBCDIC + RExC_recode_x_to_native = 0; +#endif + SvREFCNT_dec_NN(substitute_parse); nextchar(pRExC_state); - } - return count; + return TRUE; + } } @@ -11995,26 +12027,35 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } break; case 'N': - /* Handle \N and \N{NAME} with multiple code points here and not - * below because it can be multicharacter. join_exact() will join - * them up later on. Also this makes sure that things like - * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq. - * The options to the grok function call causes it to fail if the - * sequence is just a single code point. We then go treat it as - * just another character in the current EXACT node, and hence it - * gets uniform treatment with all the other characters. The - * special treatment for quantifiers is not needed for such single - * character sequences */ + /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the + * \N{...} evaluates to a sequence of more than one code points). + * The function call below returns a regnode, which is our result. + * The parameters cause it to fail if the \N{} evaluates to a + * single code point; we handle those like any other literal. The + * reason that the multicharacter case is handled here and not as + * part of the EXACtish code is because of quantifiers. 
In + * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it + * this way makes that Just Happen. dmq. + * join_exact() will join this up with adjacent EXACTish nodes + * later on, if appropriate. */ ++RExC_parse; - if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp, - depth, FALSE)) - { - if (*flagp & RESTART_UTF8) - return NULL; - RExC_parse--; - goto defchar; + if (grok_bslash_N(pRExC_state, + &ret, /* Want a regnode returned */ + NULL, /* Fail if evaluates to a single code + point */ + NULL, /* Don't need a count of how many code + points */ + flagp, + depth) + ) { + break; } - break; + + if (*flagp & RESTART_UTF8) + return NULL; + RExC_parse--; + goto defchar; + case 'k': /* Handle \k<NAME> and \k'NAME' */ parse_named_seq: { @@ -12323,18 +12364,24 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) p++; break; case 'N': /* Handle a single-code point named character. */ - /* The options cause it to fail if a multiple code - * point sequence. Handle those in the switch() above - * */ RExC_parse = p + 1; - if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL, - &ender, - flagp, - depth, - FALSE - )) { + if (! grok_bslash_N(pRExC_state, + NULL, /* Fail if evaluates to + anything other than a + single code point */ + &ender, /* The returned single code + point */ + NULL, /* Don't need a count of + how many code points */ + flagp, + depth) + ) { if (*flagp & RESTART_UTF8) FAIL("panic: grok_bslash_N set RESTART_UTF8"); + + /* Here, it wasn't a single code point. Go close + * up this EXACTish node. 
The switch() prior to + * this switch handles the other cases */ RExC_parse = p = oldp; goto loopdone; } @@ -12413,10 +12460,18 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } ender = result; - if (IN_ENCODING && ender < 0x100) { - goto recode_encoding; + if (ender < 0x100) { +#ifdef EBCDIC + if (RExC_recode_x_to_native) { + ender = LATIN1_TO_NATIVE(ender); + } + else +#endif + if (IN_ENCODING) { + goto recode_encoding; + } } - if (ender > 0xff) { + else { REQUIRE_UTF8; } break; @@ -14092,14 +14147,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { - SV *as_text; - STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value, - flagp, depth, &as_text); - if (*flagp & RESTART_UTF8) - FAIL("panic: grok_bslash_N set RESTART_UTF8"); - if (cp_count != 1) { /* The typical case drops through */ - assert(cp_count != (STRLEN) -1); - if (cp_count == 0) { + const char * const backslash_N_beg = RExC_parse - 2; + int cp_count; + + if (! grok_bslash_N(pRExC_state, + NULL, /* No regnode */ + &value, /* Yes single value */ + &cp_count, /* Multiple code pt count */ + flagp, + depth) + ) { + + if (*flagp & RESTART_UTF8) + FAIL("panic: grok_bslash_N set RESTART_UTF8"); + + if (cp_count < 0) { + vFAIL("\\N in a character class must be a named character: \\N{...}"); + } + else if (cp_count == 0) { if (strict) { RExC_parse++; /* Position after the "}" */ vFAIL("Zero length \\N{}"); @@ -14119,16 +14184,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, else if (PASS2) { ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class"); } + break; /* <value> contains the first code + point. 
Drop out of the switch to + process it */ } else { + SV * multi_char_N = newSVpvn(backslash_N_beg, + RExC_parse - backslash_N_beg); multi_char_matches = add_multi_match(multi_char_matches, - as_text, + multi_char_N, cp_count); } - break; /* <value> contains the first code - point. Drop out of the switch to - process it */ } } /* End of cp_count != 1 */ |