summaryrefslogtreecommitdiff
path: root/clang-tools-extra/pseudo
diff options
context:
space:
mode:
authorSam McCall <sam.mccall@gmail.com>2022-08-19 15:00:59 +0200
committerSam McCall <sam.mccall@gmail.com>2022-08-19 15:07:36 +0200
commit2cc7463c85c056e956ad332448189ce3e9004182 (patch)
tree20b999271235d0fa22bc9aaca366da952fd6999f /clang-tools-extra/pseudo
parent13b2a0c69ba69c1317308b6e5a8279160561c4b2 (diff)
downloadllvm-2cc7463c85c056e956ad332448189ce3e9004182.tar.gz
[pseudo] Perform unconstrained reduction prior to recovery.
Our GLR uses lookahead: only perform reductions that might be consumed by the shift immediately following. However when shift fails and so reduce is followed by recovery instead, this restriction is incorrect and leads to missing heads. In turn this means certain recovery strategies can't be made to work. e.g. ``` ns := NAMESPACE { namespace-body } [recover=Skip] ns-body := namespace_opt ``` When `namespace { namespace {` is parsed, we can recover the inner `ns` (using the `Skip` strategy to ignore the missing `}`). However this `namespace` will not be reduced to a `namespace-body` as EOF is not in the follow-set, and so we are unable to recover the outer `ns`. This patch fixes this by tracking which heads were produced by constrained reduce, and discarding and rebuilding them before performing recovery. This is a prerequisite for the `Skip` strategy mentioned above, though there are some other limitations we need to address too. Reviewed By: hokein Differential Revision: https://reviews.llvm.org/D130523
Diffstat (limited to 'clang-tools-extra/pseudo')
-rw-r--r--clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h6
-rw-r--r--clang-tools-extra/pseudo/lib/GLR.cpp18
-rw-r--r--clang-tools-extra/pseudo/unittests/GLRTest.cpp26
3 files changed, 47 insertions, 3 deletions
diff --git a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
index 890a5125a1ef..8d8f8f74f16e 100644
--- a/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
+++ b/clang-tools-extra/pseudo/include/clang-pseudo/grammar/LRTable.h
@@ -105,7 +105,11 @@ public:
bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const {
assert(isToken(Terminal));
assert(isNonterminal(Nonterminal));
- return FollowSets.test(tok::NUM_TOKENS * Nonterminal +
+ // tok::unknown is a sentinel value used in recovery: can follow anything.
+ if (tok::unknown)
+ return true;
+ return Terminal == tokenSymbol(tok::unknown) ||
+ FollowSets.test(tok::NUM_TOKENS * Nonterminal +
symbolToToken(Terminal));
}
diff --git a/clang-tools-extra/pseudo/lib/GLR.cpp b/clang-tools-extra/pseudo/lib/GLR.cpp
index 10a94860ffc9..1765eb18732c 100644
--- a/clang-tools-extra/pseudo/lib/GLR.cpp
+++ b/clang-tools-extra/pseudo/lib/GLR.cpp
@@ -601,6 +601,10 @@ const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
std::vector<const GSS::Node *> Heads = {GSS.addNode(/*State=*/StartState,
/*ForestNode=*/nullptr,
{})};
+ // Invariant: Heads is partitioned by source: {shifted | reduced}.
+ // HeadsPartition is the index of the first head formed by reduction.
+ // We use this to discard and recreate the reduced heads during recovery.
+ unsigned HeadsPartition = 0;
std::vector<const GSS::Node *> NextHeads;
auto MaybeGC = [&, Roots(std::vector<const GSS::Node *>{}), I(0u)]() mutable {
assert(NextHeads.empty() && "Running GC at the wrong time!");
@@ -623,8 +627,17 @@ const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
// If we weren't able to consume the token, try to skip over some tokens
// so we can keep parsing.
if (NextHeads.empty()) {
- // FIXME: Heads may not be fully reduced, because our reductions were
- // constrained by lookahead (but lookahead is meaningless to recovery).
+ // The reduction in the previous round was constrained by lookahead.
+ // On valid code this only rejects dead ends, but on broken code we should
+ // consider all possibilities.
+ //
+ // We discard all heads formed by reduction, and recreate them without
+ // this constraint. This may duplicate some nodes, but it's rare.
+ LLVM_DEBUG(llvm::dbgs() << "Shift failed, will attempt recovery. "
+ "Re-reducing without lookahead.");
+ Heads.resize(HeadsPartition);
+ Reduce(Heads, /*allow all reductions*/ tokenSymbol(tok::unknown));
+
glrRecover(Heads, I, Params, Lang, NextHeads);
if (NextHeads.empty())
// FIXME: Ensure the `_ := start-symbol` rules have a fallback
@@ -636,6 +649,7 @@ const ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
// Form nonterminals containing the token we just consumed.
SymbolID Lookahead =
I == Terminals.size() ? tokenSymbol(tok::eof) : Terminals[I].symbol();
+ HeadsPartition = NextHeads.size();
Reduce(NextHeads, Lookahead);
// Prepare for the next token.
std::swap(Heads, NextHeads);
diff --git a/clang-tools-extra/pseudo/unittests/GLRTest.cpp b/clang-tools-extra/pseudo/unittests/GLRTest.cpp
index 7abf9295d157..397ad6d3e800 100644
--- a/clang-tools-extra/pseudo/unittests/GLRTest.cpp
+++ b/clang-tools-extra/pseudo/unittests/GLRTest.cpp
@@ -625,6 +625,32 @@ TEST_F(GLRTest, RecoverTerminal) {
"[ 1, end) └─; := <opaque>\n");
}
+TEST_F(GLRTest, RecoverUnrestrictedReduce) {
+ // Here, ! is not in any rule and therefore not in the follow set of `word`.
+ // We would not normally reduce `word := IDENTIFIER`, but do so for recovery.
+
+ build(R"bnf(
+ _ := sentence
+
+ word := IDENTIFIER
+ sentence := word word [recover=AcceptAnyTokenInstead]
+ )bnf");
+
+ clang::LangOptions LOptions;
+ const TokenStream &Tokens = cook(lex("id !", LOptions), LOptions);
+ TestLang.Table = LRTable::buildSLR(TestLang.G);
+ TestLang.RecoveryStrategies.try_emplace(
+ extensionID("AcceptAnyTokenInstead"),
+ [](Token::Index Start, const TokenStream &Stream) { return Start + 1; });
+
+ const ForestNode &Parsed =
+ glrParse({Tokens, Arena, GSStack}, id("sentence"), TestLang);
+ EXPECT_EQ(Parsed.dumpRecursive(TestLang.G),
+ "[ 0, end) sentence := word word [recover=AcceptAnyTokenInstead]\n"
+ "[ 0, 1) ├─word := IDENTIFIER\n"
+ "[ 0, 1) │ └─IDENTIFIER := tok[0]\n"
+ "[ 1, end) └─word := <opaque>\n");
+}
TEST_F(GLRTest, NoExplicitAccept) {
build(R"bnf(