summaryrefslogtreecommitdiff
path: root/clang-tools-extra/pseudo
diff options
context:
space:
mode:
authorSam McCall <sam.mccall@gmail.com>2022-07-22 10:25:16 +0200
committerUtkarsh Saxena <usx@google.com>2022-07-22 10:35:06 +0200
commitd9d554a3f4640e8f1ed5c0ae408740861715b897 (patch)
treebc55f5bd752fd2a2335cf749dff1794650924d68 /clang-tools-extra/pseudo
parentfc99f18a20f47805f7ac127cc2147f316c6459a5 (diff)
downloadllvm-d9d554a3f4640e8f1ed5c0ae408740861715b897.tar.gz
[pseudo] Add ambiguity & unparseability metrics to -print-statistics
These can be used to quantify parsing improvements from a change. Differential Revision: https://reviews.llvm.org/D130199
Diffstat (limited to 'clang-tools-extra/pseudo')
-rw-r--r--clang-tools-extra/pseudo/test/glr.cpp3
-rw-r--r--clang-tools-extra/pseudo/tool/ClangPseudo.cpp40
2 files changed, 40 insertions, 3 deletions
diff --git a/clang-tools-extra/pseudo/test/glr.cpp b/clang-tools-extra/pseudo/test/glr.cpp
index 24b2ac05f6f1..221725c6f089 100644
--- a/clang-tools-extra/pseudo/test/glr.cpp
+++ b/clang-tools-extra/pseudo/test/glr.cpp
@@ -29,3 +29,6 @@ void foo() {
// CHECK-NEXT: 1 type-name
// CHECK-EMPTY:
// CHECK-NEXT: 0 Opaque nodes:
+// CHECK-EMPTY:
+// CHECK-NEXT: Ambiguity: 0.40 misparses/token
+// CHECK-NEXT: Unparsed: 0.00%
diff --git a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
index 2a2d8eda4c20..294098a3a5c1 100644
--- a/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
+++ b/clang-tools-extra/pseudo/tool/ClangPseudo.cpp
@@ -24,6 +24,8 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Signals.h"
+using clang::pseudo::ForestNode;
+using clang::pseudo::Token;
using clang::pseudo::TokenStream;
using llvm::cl::desc;
using llvm::cl::init;
@@ -174,9 +176,8 @@ int main(int argc, char *argv[]) {
llvm::outs() << "GSS bytes: " << GSS.bytes()
<< " nodes: " << GSS.nodesCreated() << "\n";
- for (auto &P :
- {std::make_pair("Ambiguous", clang::pseudo::ForestNode::Ambiguous),
- std::make_pair("Opaque", clang::pseudo::ForestNode::Opaque)}) {
+ for (auto &P : {std::make_pair("Ambiguous", ForestNode::Ambiguous),
+ std::make_pair("Opaque", ForestNode::Opaque)}) {
clang::pseudo::NodeStats Stats(
Root, [&](const auto &N) { return N.kind() == P.second; });
llvm::outs() << "\n" << Stats.Total << " " << P.first << " nodes:\n";
@@ -184,6 +185,39 @@ int main(int argc, char *argv[]) {
llvm::outs() << llvm::formatv(" {0,3} {1}\n", S.second,
Lang.G.symbolName(S.first));
}
+
+ // Metrics for how imprecise parsing was.
+ // These are rough but aim to be:
+ // - linear: if we eliminate half the errors the metric should halve
+ // - length-independent
+ unsigned UnparsedTokens = 0; // Tokens covered by Opaque. (not unique)
+ unsigned Misparses = 0; // Sum of alternatives-1
+ llvm::DenseSet<const ForestNode *> Visited;
+ auto DFS = [&](const ForestNode &N, Token::Index End, auto &DFS) -> void {
+ if (N.kind() == ForestNode::Opaque) {
+ UnparsedTokens += End - N.startTokenIndex();
+ } else if (N.kind() == ForestNode::Ambiguous) {
+ Misparses += N.alternatives().size() - 1;
+ for (const auto *C : N.alternatives())
+ if (Visited.insert(C).second)
+ DFS(*C, End, DFS);
+ } else if (N.kind() == ForestNode::Sequence) {
+ for (unsigned I = 0, E = N.children().size(); I < E; ++I)
+ if (Visited.insert(N.children()[I]).second)
+ DFS(*N.children()[I],
+ I + 1 == N.children().size()
+ ? End
+ : N.children()[I + 1]->startTokenIndex(),
+ DFS);
+ }
+ };
+ unsigned Len = ParseableStream->tokens().size();
+ DFS(Root, Len, DFS);
+ llvm::outs() << "\n";
+ llvm::outs() << llvm::formatv("Ambiguity: {0} misparses/token\n",
+ double(Misparses) / Len);
+ llvm::outs() << llvm::formatv("Unparsed: {0}%\n",
+ 100.0 * UnparsedTokens / Len);
}
}