diff options
Diffstat (limited to 'Zend/zend_language_parser.y')
-rw-r--r-- | Zend/zend_language_parser.y | 431 |
1 files changed, 255 insertions, 176 deletions
diff --git a/Zend/zend_language_parser.y b/Zend/zend_language_parser.y index aa3164b735..40938256f2 100644 --- a/Zend/zend_language_parser.y +++ b/Zend/zend_language_parser.y @@ -86,146 +86,146 @@ static YYSIZE_T zend_yytnamerr(char*, const char*); %precedence T_ELSEIF %precedence T_ELSE -%token <ast> T_LNUMBER "integer number (T_LNUMBER)" -%token <ast> T_DNUMBER "floating-point number (T_DNUMBER)" -%token <ast> T_STRING "identifier (T_STRING)" -%token <ast> T_VARIABLE "variable (T_VARIABLE)" +%token <ast> T_LNUMBER "integer" +%token <ast> T_DNUMBER "floating-point number" +%token <ast> T_STRING "identifier" +%token <ast> T_VARIABLE "variable" %token <ast> T_INLINE_HTML -%token <ast> T_ENCAPSED_AND_WHITESPACE "quoted-string and whitespace (T_ENCAPSED_AND_WHITESPACE)" -%token <ast> T_CONSTANT_ENCAPSED_STRING "quoted-string (T_CONSTANT_ENCAPSED_STRING)" -%token <ast> T_STRING_VARNAME "variable name (T_STRING_VARNAME)" -%token <ast> T_NUM_STRING "number (T_NUM_STRING)" - -%token <ident> T_INCLUDE "include (T_INCLUDE)" -%token <ident> T_INCLUDE_ONCE "include_once (T_INCLUDE_ONCE)" -%token <ident> T_EVAL "eval (T_EVAL)" -%token <ident> T_REQUIRE "require (T_REQUIRE)" -%token <ident> T_REQUIRE_ONCE "require_once (T_REQUIRE_ONCE)" -%token <ident> T_LOGICAL_OR "or (T_LOGICAL_OR)" -%token <ident> T_LOGICAL_XOR "xor (T_LOGICAL_XOR)" -%token <ident> T_LOGICAL_AND "and (T_LOGICAL_AND)" -%token <ident> T_PRINT "print (T_PRINT)" -%token <ident> T_YIELD "yield (T_YIELD)" -%token <ident> T_YIELD_FROM "yield from (T_YIELD_FROM)" -%token <ident> T_INSTANCEOF "instanceof (T_INSTANCEOF)" -%token <ident> T_NEW "new (T_NEW)" -%token <ident> T_CLONE "clone (T_CLONE)" -%token <ident> T_EXIT "exit (T_EXIT)" -%token <ident> T_IF "if (T_IF)" -%token <ident> T_ELSEIF "elseif (T_ELSEIF)" -%token <ident> T_ELSE "else (T_ELSE)" -%token <ident> T_ENDIF "endif (T_ENDIF)" -%token <ident> T_ECHO "echo (T_ECHO)" -%token <ident> T_DO "do (T_DO)" -%token <ident> T_WHILE "while (T_WHILE)" -%token <ident> T_ENDWHILE "endwhile (T_ENDWHILE)" -%token <ident> T_FOR "for (T_FOR)" -%token <ident> T_ENDFOR "endfor (T_ENDFOR)" -%token <ident> T_FOREACH "foreach (T_FOREACH)" -%token <ident> T_ENDFOREACH "endforeach (T_ENDFOREACH)" -%token <ident> T_DECLARE "declare (T_DECLARE)" -%token <ident> T_ENDDECLARE "enddeclare (T_ENDDECLARE)" -%token <ident> T_AS "as (T_AS)" -%token <ident> T_SWITCH "switch (T_SWITCH)" -%token <ident> T_ENDSWITCH "endswitch (T_ENDSWITCH)" -%token <ident> T_CASE "case (T_CASE)" -%token <ident> T_DEFAULT "default (T_DEFAULT)" -%token <ident> T_MATCH "match (T_MATCH)" -%token <ident> T_BREAK "break (T_BREAK)" -%token <ident> T_CONTINUE "continue (T_CONTINUE)" -%token <ident> T_GOTO "goto (T_GOTO)" -%token <ident> T_FUNCTION "function (T_FUNCTION)" -%token <ident> T_FN "fn (T_FN)" -%token <ident> T_CONST "const (T_CONST)" -%token <ident> T_RETURN "return (T_RETURN)" -%token <ident> T_TRY "try (T_TRY)" -%token <ident> T_CATCH "catch (T_CATCH)" -%token <ident> T_FINALLY "finally (T_FINALLY)" -%token <ident> T_THROW "throw (T_THROW)" -%token <ident> T_USE "use (T_USE)" -%token <ident> T_INSTEADOF "insteadof (T_INSTEADOF)" -%token <ident> T_GLOBAL "global (T_GLOBAL)" -%token <ident> T_STATIC "static (T_STATIC)" -%token <ident> T_ABSTRACT "abstract (T_ABSTRACT)" -%token <ident> T_FINAL "final (T_FINAL)" -%token <ident> T_PRIVATE "private (T_PRIVATE)" -%token <ident> T_PROTECTED "protected (T_PROTECTED)" -%token <ident> T_PUBLIC "public (T_PUBLIC)" -%token <ident> T_VAR "var (T_VAR)" -%token <ident> T_UNSET "unset (T_UNSET)" -%token <ident> T_ISSET "isset (T_ISSET)" -%token <ident> T_EMPTY "empty (T_EMPTY)" -%token <ident> T_HALT_COMPILER "__halt_compiler (T_HALT_COMPILER)" -%token <ident> T_CLASS "class (T_CLASS)" -%token <ident> T_TRAIT "trait (T_TRAIT)" -%token <ident> T_INTERFACE "interface (T_INTERFACE)" -%token <ident> T_EXTENDS "extends (T_EXTENDS)" -%token <ident> T_IMPLEMENTS "implements (T_IMPLEMENTS)" -%token <ident> T_NAMESPACE "namespace (T_NAMESPACE)" -%token <ident> T_LIST "list (T_LIST)" -%token <ident> T_ARRAY "array (T_ARRAY)" -%token <ident> T_CALLABLE "callable (T_CALLABLE)" -%token <ident> T_LINE "__LINE__ (T_LINE)" -%token <ident> T_FILE "__FILE__ (T_FILE)" -%token <ident> T_DIR "__DIR__ (T_DIR)" -%token <ident> T_CLASS_C "__CLASS__ (T_CLASS_C)" -%token <ident> T_TRAIT_C "__TRAIT__ (T_TRAIT_C)" -%token <ident> T_METHOD_C "__METHOD__ (T_METHOD_C)" -%token <ident> T_FUNC_C "__FUNCTION__ (T_FUNC_C)" -%token <ident> T_NS_C "__NAMESPACE__ (T_NS_C)" +%token <ast> T_ENCAPSED_AND_WHITESPACE "string content" +%token <ast> T_CONSTANT_ENCAPSED_STRING "quoted string" +%token <ast> T_STRING_VARNAME "variable name" +%token <ast> T_NUM_STRING "number" + +%token <ident> T_INCLUDE "'include'" +%token <ident> T_INCLUDE_ONCE "'include_once'" +%token <ident> T_EVAL "'eval'" +%token <ident> T_REQUIRE "'require'" +%token <ident> T_REQUIRE_ONCE "'require_once'" +%token <ident> T_LOGICAL_OR "'or'" +%token <ident> T_LOGICAL_XOR "'xor'" +%token <ident> T_LOGICAL_AND "'and'" +%token <ident> T_PRINT "'print'" +%token <ident> T_YIELD "'yield'" +%token <ident> T_YIELD_FROM "'yield from'" +%token <ident> T_INSTANCEOF "'instanceof'" +%token <ident> T_NEW "'new'" +%token <ident> T_CLONE "'clone'" +%token <ident> T_EXIT "'exit'" +%token <ident> T_IF "'if'" +%token <ident> T_ELSEIF "'elseif'" +%token <ident> T_ELSE "'else'" +%token <ident> T_ENDIF "'endif'" +%token <ident> T_ECHO "'echo'" +%token <ident> T_DO "'do'" +%token <ident> T_WHILE "'while'" +%token <ident> T_ENDWHILE "'endwhile'" +%token <ident> T_FOR "'for'" +%token <ident> T_ENDFOR "'endfor'" +%token <ident> T_FOREACH "'foreach'" +%token <ident> T_ENDFOREACH "'endforeach'" +%token <ident> T_DECLARE "'declare'" +%token <ident> T_ENDDECLARE "'enddeclare'" +%token <ident> T_AS "'as'" +%token <ident> T_SWITCH "'switch'" +%token <ident> T_ENDSWITCH "'endswitch'" +%token <ident> T_CASE "'case'" +%token <ident> T_DEFAULT "'default'" +%token <ident> T_MATCH "'match'" +%token <ident> T_BREAK "'break'" +%token <ident> T_CONTINUE "'continue'" +%token <ident> T_GOTO "'goto'" +%token <ident> T_FUNCTION "'function'" +%token <ident> T_FN "'fn'" +%token <ident> T_CONST "'const'" +%token <ident> T_RETURN "'return'" +%token <ident> T_TRY "'try'" +%token <ident> T_CATCH "'catch'" +%token <ident> T_FINALLY "'finally'" +%token <ident> T_THROW "'throw'" +%token <ident> T_USE "'use'" +%token <ident> T_INSTEADOF "'insteadof'" +%token <ident> T_GLOBAL "'global'" +%token <ident> T_STATIC "'static'" +%token <ident> T_ABSTRACT "'abstract'" +%token <ident> T_FINAL "'final'" +%token <ident> T_PRIVATE "'private'" +%token <ident> T_PROTECTED "'protected'" +%token <ident> T_PUBLIC "'public'" +%token <ident> T_VAR "'var'" +%token <ident> T_UNSET "'unset'" +%token <ident> T_ISSET "'isset'" +%token <ident> T_EMPTY "'empty'" +%token <ident> T_HALT_COMPILER "'__halt_compiler'" +%token <ident> T_CLASS "'class'" +%token <ident> T_TRAIT "'trait'" +%token <ident> T_INTERFACE "'interface'" +%token <ident> T_EXTENDS "'extends'" +%token <ident> T_IMPLEMENTS "'implements'" +%token <ident> T_NAMESPACE "'namespace'" +%token <ident> T_LIST "'list'" +%token <ident> T_ARRAY "'array'" +%token <ident> T_CALLABLE "'callable'" +%token <ident> T_LINE "'__LINE__'" +%token <ident> T_FILE "'__FILE__'" +%token <ident> T_DIR "'__DIR__'" +%token <ident> T_CLASS_C "'__CLASS__'" +%token <ident> T_TRAIT_C "'__TRAIT__'" +%token <ident> T_METHOD_C "'__METHOD__'" +%token <ident> T_FUNC_C "'__FUNCTION__'" +%token <ident> T_NS_C "'__NAMESPACE__'" %token END 0 "end of file" -%token T_PLUS_EQUAL "+= (T_PLUS_EQUAL)" -%token T_MINUS_EQUAL "-= (T_MINUS_EQUAL)" -%token T_MUL_EQUAL "*= (T_MUL_EQUAL)" -%token T_DIV_EQUAL "/= (T_DIV_EQUAL)" -%token T_CONCAT_EQUAL ".= (T_CONCAT_EQUAL)" -%token T_MOD_EQUAL "%= (T_MOD_EQUAL)" -%token T_AND_EQUAL "&= (T_AND_EQUAL)" -%token T_OR_EQUAL "|= (T_OR_EQUAL)" -%token T_XOR_EQUAL "^= (T_XOR_EQUAL)" -%token T_SL_EQUAL "<<= (T_SL_EQUAL)" -%token T_SR_EQUAL ">>= (T_SR_EQUAL)" -%token T_COALESCE_EQUAL "??= (T_COALESCE_EQUAL)" -%token T_BOOLEAN_OR "|| (T_BOOLEAN_OR)" -%token T_BOOLEAN_AND "&& (T_BOOLEAN_AND)" -%token T_IS_EQUAL "== (T_IS_EQUAL)" -%token T_IS_NOT_EQUAL "!= (T_IS_NOT_EQUAL)" -%token T_IS_IDENTICAL "=== (T_IS_IDENTICAL)" -%token T_IS_NOT_IDENTICAL "!== (T_IS_NOT_IDENTICAL)" -%token T_IS_SMALLER_OR_EQUAL "<= (T_IS_SMALLER_OR_EQUAL)" -%token T_IS_GREATER_OR_EQUAL ">= (T_IS_GREATER_OR_EQUAL)" -%token T_SPACESHIP "<=> (T_SPACESHIP)" -%token T_SL "<< (T_SL)" -%token T_SR ">> (T_SR)" -%token T_INC "++ (T_INC)" -%token T_DEC "-- (T_DEC)" -%token T_INT_CAST "(int) (T_INT_CAST)" -%token T_DOUBLE_CAST "(double) (T_DOUBLE_CAST)" -%token T_STRING_CAST "(string) (T_STRING_CAST)" -%token T_ARRAY_CAST "(array) (T_ARRAY_CAST)" -%token T_OBJECT_CAST "(object) (T_OBJECT_CAST)" -%token T_BOOL_CAST "(bool) (T_BOOL_CAST)" -%token T_UNSET_CAST "(unset) (T_UNSET_CAST)" -%token T_OBJECT_OPERATOR "-> (T_OBJECT_OPERATOR)" -%token T_DOUBLE_ARROW "=> (T_DOUBLE_ARROW)" -%token T_COMMENT "comment (T_COMMENT)" -%token T_DOC_COMMENT "doc comment (T_DOC_COMMENT)" -%token T_OPEN_TAG "open tag (T_OPEN_TAG)" -%token T_OPEN_TAG_WITH_ECHO "open tag with echo (T_OPEN_TAG_WITH_ECHO)" -%token T_CLOSE_TAG "close tag (T_CLOSE_TAG)" -%token T_WHITESPACE "whitespace (T_WHITESPACE)" -%token T_START_HEREDOC "heredoc start (T_START_HEREDOC)" -%token T_END_HEREDOC "heredoc end (T_END_HEREDOC)" -%token T_DOLLAR_OPEN_CURLY_BRACES "${ (T_DOLLAR_OPEN_CURLY_BRACES)" -%token T_CURLY_OPEN "{$ (T_CURLY_OPEN)" -%token T_PAAMAYIM_NEKUDOTAYIM ":: (T_PAAMAYIM_NEKUDOTAYIM)" -%token T_NS_SEPARATOR "\\ (T_NS_SEPARATOR)" -%token T_ELLIPSIS "... (T_ELLIPSIS)" -%token T_COALESCE "?? (T_COALESCE)" -%token T_POW "** (T_POW)" -%token T_POW_EQUAL "**= (T_POW_EQUAL)" -%token T_BAD_CHARACTER "invalid character (T_BAD_CHARACTER)" +%token T_PLUS_EQUAL "'+='" +%token T_MINUS_EQUAL "'-='" +%token T_MUL_EQUAL "'*='" +%token T_DIV_EQUAL "'/='" +%token T_CONCAT_EQUAL "'.='" +%token T_MOD_EQUAL "'%='" +%token T_AND_EQUAL "'&='" +%token T_OR_EQUAL "'|='" +%token T_XOR_EQUAL "'^='" +%token T_SL_EQUAL "'<<='" +%token T_SR_EQUAL "'>>='" +%token T_COALESCE_EQUAL "'??='" +%token T_BOOLEAN_OR "'||'" +%token T_BOOLEAN_AND "'&&'" +%token T_IS_EQUAL "'=='" +%token T_IS_NOT_EQUAL "'!='" +%token T_IS_IDENTICAL "'==='" +%token T_IS_NOT_IDENTICAL "'!=='" +%token T_IS_SMALLER_OR_EQUAL "'<='" +%token T_IS_GREATER_OR_EQUAL "'>='" +%token T_SPACESHIP "'<=>'" +%token T_SL "'<<'" +%token T_SR "'>>'" +%token T_INC "'++'" +%token T_DEC "'--'" +%token T_INT_CAST "'(int)'" +%token T_DOUBLE_CAST "'(double)'" +%token T_STRING_CAST "'(string)'" +%token T_ARRAY_CAST "'(array)'" +%token T_OBJECT_CAST "'(object)'" +%token T_BOOL_CAST "'(bool)'" +%token T_UNSET_CAST "'(unset)'" +%token T_OBJECT_OPERATOR "'->'" +%token T_DOUBLE_ARROW "'=>'" +%token T_COMMENT "comment" +%token T_DOC_COMMENT "doc comment" +%token T_OPEN_TAG "open tag" +%token T_OPEN_TAG_WITH_ECHO "'<?='" +%token T_CLOSE_TAG "'?>'" +%token T_WHITESPACE "whitespace" +%token T_START_HEREDOC "heredoc start" +%token T_END_HEREDOC "heredoc end" +%token T_DOLLAR_OPEN_CURLY_BRACES "'${'" +%token T_CURLY_OPEN "'{$'" +%token T_PAAMAYIM_NEKUDOTAYIM "'::'" +%token T_NS_SEPARATOR "'\\'" +%token T_ELLIPSIS "'...'" +%token T_COALESCE "'??'" +%token T_POW "'**'" +%token T_POW_EQUAL "'**='" +%token T_BAD_CHARACTER "invalid character" /* Token used to force a parse error from the lexer */ %token T_ERROR @@ -1438,15 +1438,16 @@ isset_variable: %% -/* Copy to YYRES the contents of YYSTR after stripping away unnecessary - quotes and backslashes, so that it's suitable for yyerror. The - heuristic is that double-quoting is unnecessary unless the string - contains an apostrophe, a comma, or backslash (other than - backslash-backslash). YYSTR is taken from yytname. If YYRES is - null, do not copy; instead, return the length of what the result - would have been. */ +/* Over-ride Bison formatting routine to give better token descriptions. + Copy to YYRES the contents of YYSTR for use in yyerror. + YYSTR is taken from yytname, from the %token declaration. + If YYRES is null, do not copy; instead, return the length of what + the result would have been. */ static YYSIZE_T zend_yytnamerr(char *yyres, const char *yystr) { + const char *toktype = yystr; + size_t toktype_len = strlen(toktype); + /* CG(parse_error) states: * 0 => yyres = NULL, yystr is the unexpected token * 1 => yyres = NULL, yystr is one of the expected tokens @@ -1460,63 +1461,141 @@ static YYSIZE_T zend_yytnamerr(char *yyres, const char *yystr) if (CG(parse_error) % 2 == 0) { /* The unexpected token */ char buffer[120]; - const unsigned char *end, *str, *tok1 = NULL, *tok2 = NULL; - unsigned int len = 0, toklen = 0, yystr_len; + const unsigned char *tokcontent, *tokcontent_end; + size_t tokcontent_len; CG(parse_error)++; if (LANG_SCNG(yy_text)[0] == 0 && LANG_SCNG(yy_leng) == 1 && - strcmp(yystr, "\"end of file\"") == 0) { + strcmp(toktype, "\"end of file\"") == 0) { if (yyres) { yystpcpy(yyres, "end of file"); } return sizeof("end of file")-1; } - str = LANG_SCNG(yy_text); - end = memchr(str, '\n', LANG_SCNG(yy_leng)); - yystr_len = (unsigned int)strlen(yystr); + /* Prevent the backslash getting doubled in the output (eugh) */ + if (strcmp(toktype, "\"'\\\\'\"") == 0) { + if (yyres) { + yystpcpy(yyres, "token \"\\\""); + } + return sizeof("token \"\\\"")-1; + } - if ((tok1 = memchr(yystr, '(', yystr_len)) != NULL - && (tok2 = zend_memrchr(yystr, ')', yystr_len)) != NULL) { - toklen = (tok2 - tok1) + 1; - } else { - tok1 = tok2 = NULL; - toklen = 0; + /* Avoid unreadable """ */ + /* "'" would theoretically be just as bad, but is never currently parsed as a separate token */ + if (strcmp(toktype, "'\"'") == 0) { + if (yyres) { + yystpcpy(yyres, "double-quote mark"); + } + return sizeof("double-quote mark")-1; } - if (end == NULL) { - len = LANG_SCNG(yy_leng) > 30 ? 30 : LANG_SCNG(yy_leng); - } else { - len = (end - str) > 30 ? 30 : (end - str); + /* Strip off the outer quote marks */ + if (toktype_len >= 2 && *toktype == '"') { + toktype++; + toktype_len -= 2; } - if (yyres) { - if (toklen) { - snprintf(buffer, sizeof(buffer), "'%.*s' %.*s", len, str, toklen, tok1); - } else { - snprintf(buffer, sizeof(buffer), "'%.*s'", len, str); + + /* If the token always has one form, the %token line should have a single-quoted name */ + /* The parser rules also include single-character un-named tokens which will be single-quoted here */ + /* We re-format this with double quotes here to ensure everything's consistent */ + if (toktype_len > 0 && *toktype == '\'') { + if (yyres) { + snprintf(buffer, sizeof(buffer), "token \"%.*s\"", (int)toktype_len-2, toktype+1); + yystpcpy(yyres, buffer); + } + return toktype_len + sizeof("token ")-1; + } + + /* Fetch the content of the last seen token from global lexer state */ + tokcontent = LANG_SCNG(yy_text); + tokcontent_len = LANG_SCNG(yy_leng); + + /* For T_BAD_CHARACTER, the content probably won't be a printable char */ + /* Also, "unexpected invalid character" sounds a bit redundant */ + if (tokcontent_len == 1 && strcmp(yystr, "\"invalid character\"") == 0) { + if (yyres) { + snprintf(buffer, sizeof(buffer), "character 0x%02hhX", *tokcontent); + yystpcpy(yyres, buffer); + } + return sizeof("character 0x00")-1; + } + + /* Truncate at line end to avoid messing up log formats */ + tokcontent_end = memchr(tokcontent, '\n', tokcontent_len); + if (tokcontent_end != NULL) { + tokcontent_len = (tokcontent_end - tokcontent); + } + + /* Try to be helpful about what kind of string was found, before stripping the quotes */ + if (tokcontent_len > 0 && strcmp(yystr, "\"quoted string\"") == 0) { + if (*tokcontent == '"') { + toktype = "double-quoted string"; + toktype_len = sizeof("double-quoted string")-1; } + else if (*tokcontent == '\'') { + toktype = "single-quoted string"; + toktype_len = sizeof("single-quoted string")-1; + } + } + + /* For quoted strings, strip off another layer of quotes to avoid putting quotes inside quotes */ + if (tokcontent_len > 0 && (*tokcontent == '\'' || *tokcontent=='"')) { + tokcontent++; + tokcontent_len--; + } + if (tokcontent_len > 0 && (tokcontent[tokcontent_len-1] == '\'' || tokcontent[tokcontent_len-1] == '"')) { + tokcontent_len--; + } + + /* Truncate to 30 characters and add a ... */ + if (tokcontent_len > 30 + sizeof("...")-1) { + if (yyres) { + snprintf(buffer, sizeof(buffer), "%.*s \"%.*s...\"", (int)toktype_len, toktype, 30, tokcontent); + yystpcpy(yyres, buffer); + } + return toktype_len + 30 + sizeof(" \"...\"")-1; + } + + if (yyres) { + snprintf(buffer, sizeof(buffer), "%.*s \"%.*s\"", (int)toktype_len, toktype, (int)tokcontent_len, tokcontent); yystpcpy(yyres, buffer); } - return len + (toklen ? toklen + 1 : 0) + 2; + return toktype_len + tokcontent_len + sizeof(" \"\"")-1; } /* One of the expected tokens */ - if (!yyres) { - return strlen(yystr) - (*yystr == '"' ? 2 : 0); + + /* Prevent the backslash getting doubled in the output (eugh) */ + if (strcmp(toktype, "\"'\\\\'\"") == 0) { + if (yyres) { + yystpcpy(yyres, "\"\\\""); + } + return sizeof("\"\\\"")-1; } - if (*yystr == '"') { + /* Strip off the outer quote marks */ + if (toktype_len >= 2 && *toktype == '"') { + toktype++; + toktype_len -= 2; + } + + if (yyres) { YYSIZE_T yyn = 0; - const char *yyp = yystr; - for (; *++yyp != '"'; ++yyn) { - yyres[yyn] = *yyp; + for (; yyn < toktype_len; ++yyn) { + /* Replace single quotes with double for consistency */ + if (toktype[yyn] == '\'') { + yyres[yyn] = '"'; + } + else { + yyres[yyn] = toktype[yyn]; + } } - yyres[yyn] = '\0'; - return yyn; + yyres[toktype_len] = '\0'; } - yystpcpy(yyres, yystr); - return strlen(yystr); + + return toktype_len; } |