summaryrefslogtreecommitdiff
path: root/chromium/base/substring_set_matcher/substring_set_matcher.h
blob: fcb5076f8ebc58fbf48b6f2514bbec9e2540eebb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
#define BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_

#include <stdint.h>

#include <limits>
#include <set>
#include <string>
#include <vector>

#include "base/base_export.h"
#include "base/check_op.h"
#include "base/substring_set_matcher/matcher_string_pattern.h"

namespace base {

// Class that store a set of string patterns and can find for a string S,
// which string patterns occur in S.
class BASE_EXPORT SubstringSetMatcher {
 public:
  SubstringSetMatcher() = default;
  SubstringSetMatcher(const SubstringSetMatcher&) = delete;
  SubstringSetMatcher& operator=(const SubstringSetMatcher&) = delete;

  ~SubstringSetMatcher();

  // Registers all |patterns|. Each pattern needs to have a unique ID and all
  // pattern strings must be unique. Build() should be called exactly once
  // (before it is called, the tree is empty).
  //
  // Complexity:
  //    Let n = number of patterns.
  //    Let S = sum of pattern lengths.
  //    Let k = range of char. Generally 256.
  // Complexity = O(nlogn + S * logk)
  // nlogn comes from sorting the patterns.
  // log(k) comes from our usage of std::map to store edges.
  //
  // Returns true on success (may fail if e.g. if the tree gets too many nodes).
  bool Build(const std::vector<MatcherStringPattern>& patterns);
  bool Build(std::vector<const MatcherStringPattern*> patterns);

  // Matches |text| against all registered MatcherStringPatterns. Stores the IDs
  // of matching patterns in |matches|. |matches| is not cleared before adding
  // to it.
  // Complexity:
  //    Let t = length of |text|.
  //    Let k = range of char. Generally 256.
  //    Let z = number of matches returned.
  // Complexity = O(t * logk + zlogz)
  bool Match(const std::string& text,
             std::set<MatcherStringPattern::ID>* matches) const;

  // As Match(), except it returns immediately on the first match.
  // This allows true/false matching to be done without any dynamic
  // memory allocation.
  // Complexity = O(t * logk)
  bool AnyMatch(const std::string& text) const;

  // Returns true if this object retains no allocated data.
  bool IsEmpty() const { return is_empty_; }

  // Returns the dynamically allocated memory usage in bytes. See
  // base/trace_event/memory_usage_estimator.h for details.
  size_t EstimateMemoryUsage() const;

 private:
  // Represents the index of the node within |tree_|. It is specifically
  // uint32_t so that we can be sure it takes up 4 bytes when stored together
  // with the 9-bit label (so 23 bits are allocated to the NodeID, even though
  // it is exposed as uint32_t). If the computed size of |tree_| is
  // larger than what can be stored within 23 bits, Build() will fail.
  using NodeID = uint32_t;

  // This is the maximum possible size of |tree_| and hence can't be a valid ID.
  static constexpr NodeID kInvalidNodeID = (1u << 23) - 1;

  static constexpr NodeID kRootID = 0;

  // A node of an Aho Corasick Tree. See
  // http://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/02/Small02.pdf
  // to understand the algorithm.
  //
  // The algorithm is based on the idea of building a trie of all registered
  // patterns. Each node of the tree is annotated with a set of pattern
  // IDs that are used to report matches.
  //
  // The root of the trie represents an empty match. If we were looking whether
  // any registered pattern matches a text at the beginning of the text (i.e.
  // whether any pattern is a prefix of the text), we could just follow
  // nodes in the trie according to the matching characters in the text.
  // E.g., if text == "foobar", we would follow the trie from the root node
  // to its child labeled 'f', from there to child 'o', etc. In this process we
  // would report all pattern IDs associated with the trie nodes as matches.
  //
  // As we are not looking for all prefix matches but all substring matches,
  // this algorithm would need to compare text.substr(0), text.substr(1), ...
  // against the trie, which is in O(|text|^2).
  //
  // The Aho Corasick algorithm improves this runtime by using failure edges.
  // In case we have found a partial match of length k in the text
  // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
  // a node at depth k, but cannot find a match in the trie for character
  // text[i + k] at depth k + 1, we follow a failure edge. This edge
  // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
  // is a prefix of any registered pattern.
  //
  // If your brain thinks "Forget it, let's go shopping.", don't worry.
  // Take a nap and read an introductory text on the Aho Corasick algorithm.
  // It will make sense. Eventually.

  // An edge internal to the tree. We pack the label (character we are
  // matching on) and the destination node ID into 32 bits, to save memory.
  // We also use these edges as a sort of generic key/value store for
  // some special values that not all nodes will have; this also saves on
  // memory over the otherwise obvious choice of having them as struct fields,
  // as it means we do not to store them when they are not present.
  struct AhoCorasickEdge {
    // char (unsigned, so [0..255]), or a special label below.
    uint32_t label : 9;
    NodeID node_id : 23;
  };

  // Node index that failure edge leads to. The failure node corresponds to
  // the node which represents the longest proper suffix (include empty
  // string) of the string represented by this node. Not stored if it is
  // equal to kRootID (since that is the most common value).
  //
  // NOTE: Assigning |root| as the failure edge for itself doesn't strictly
  // abide by the definition of "proper" suffix. The proper suffix of an empty
  // string should probably be defined as null, but we assign it to the |root|
  // to simplify the code and have the invariant that the failure edge is always
  // defined.
  static constexpr uint32_t kFailureNodeLabel = 0x100;
  static constexpr uint32_t kFirstSpecialLabel = kFailureNodeLabel;

  // Node index that corresponds to the longest proper suffix (including empty
  // suffix) of this node and which also represents the end of a pattern.
  // Does not have to exist.
  static constexpr uint32_t kOutputLinkLabel = 0x101;

  // If present, this node represents the end of a pattern. It stores the ID of
  // the corresponding pattern (ie., it is not really a NodeID, but a
  // MatcherStringPattern::ID).
  static constexpr uint32_t kMatchIDLabel = 0x102;

  // Used for uninitialized label slots; used so that we do not have to test for
  // them in other ways, since we know the data will be initialized and never
  // match any other labels.
  static constexpr uint32_t kEmptyLabel = 0x103;

  // A node in the trie, packed tightly together so that it occupies 12 bytes
  // (both on 32- and 64-bit platforms), but aligned to at least 4 (see the
  // comment on edges_).
  class alignas(AhoCorasickEdge) AhoCorasickNode {
   public:
    AhoCorasickNode();
    ~AhoCorasickNode();
    AhoCorasickNode(AhoCorasickNode&& other);
    AhoCorasickNode& operator=(AhoCorasickNode&& other);

    NodeID GetEdge(uint32_t label) const {
      if (edges_capacity_ != 0) {
        return GetEdgeNoInline(label);
      }
      static_assert(kNumInlineEdges == 2, "Code below needs updating");
      if (edges_.inline_edges[0].label == label) {
        return edges_.inline_edges[0].node_id;
      }
      if (edges_.inline_edges[1].label == label) {
        return edges_.inline_edges[1].node_id;
      }
      return kInvalidNodeID;
    }
    NodeID GetEdgeNoInline(uint32_t label) const;
    void SetEdge(uint32_t label, NodeID node);
    const AhoCorasickEdge* edges() const {
      // NOTE: Returning edges_.inline_edges here is fine, because it's
      // the first thing in the struct (see the comment on edges_).
      DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(edges_.inline_edges) %
                        alignof(AhoCorasickEdge));
      return edges_capacity_ == 0 ? edges_.inline_edges : edges_.edges;
    }

    NodeID failure() const {
      // NOTE: Even if num_edges_ == 0, we are not doing anything
      // undefined, as we will have room for at least two edges
      // and empty edges are set to kEmptyLabel.
      const AhoCorasickEdge& first_edge = *edges();
      if (first_edge.label == kFailureNodeLabel) {
        return first_edge.node_id;
      } else {
        return kRootID;
      }
    }
    void SetFailure(NodeID failure);

    void SetMatchID(MatcherStringPattern::ID id) {
      DCHECK(!IsEndOfPattern());
      DCHECK(id < kInvalidNodeID);  // This is enforced by Build().
      SetEdge(kMatchIDLabel, static_cast<NodeID>(id));
      has_outputs_ = true;
    }

    // Returns true if this node corresponds to a pattern.
    bool IsEndOfPattern() const {
      if (!has_outputs_) {
        // Fast reject.
        return false;
      }
      return GetEdge(kMatchIDLabel) != kInvalidNodeID;
    }

    // Must only be called if |IsEndOfPattern| returns true for this node.
    MatcherStringPattern::ID GetMatchID() const {
      DCHECK(IsEndOfPattern());
      return GetEdge(kMatchIDLabel);
    }

    void SetOutputLink(NodeID node) {
      if (node != kInvalidNodeID) {
        SetEdge(kOutputLinkLabel, node);
        has_outputs_ = true;
      }
    }
    NodeID output_link() const { return GetEdge(kOutputLinkLabel); }

    size_t EstimateMemoryUsage() const;
    size_t num_edges() const {
      if (edges_capacity_ == 0) {
        return kNumInlineEdges - num_free_edges_;
      } else {
        return edges_capacity_ - num_free_edges_;
      }
    }

    bool has_outputs() const { return has_outputs_; }

   private:
    // Outgoing edges of current node, including failure edge and output links.
    // Most nodes have only one or two (or even zero) edges, not the last
    // because many of them are leaves. Thus, we make an optimization for this
    // common case; instead of a pointer to an edge array on the heap, we can
    // pack two edges inline where the pointer would otherwise be. This reduces
    // memory usage dramatically, as well as saving us a cache-line fetch.
    //
    // Note that even though most nodes have fewer outgoing edges, most nodes
    // that we actually traverse will have any of them. This apparent
    // contradiction is because we tend to spend more of our time near the root
    // of the trie, where it is wide. This means that another layout would be
    // possible: If we wanted to, non-inline nodes could simply store an array
    // of 259 (256 possible characters plus the three special label types)
    // edges, indexed directly by label type. This would use 20–50% more RAM,
    // but also increases the speed of lookups due to removing the search loop.
    //
    // The nodes are generally unordered; since we typically index text, even
    // the root will rarely be more than 20–30 wide, and at that point, it's
    // better to just do a linear search than a binary one (which fares poorly
    // on branch predictors). However, a special case, we put kFailureNodeLabel
    // in the first slot if it exists (ie., is not equal to kRootID), since we
    // need to access that label during every single node we look at during
    // traversal.
    //
    // NOTE: Keep this the first member in the struct, so that inline_edges gets
    // 4-aligned (since the class is marked as such, despite being packed.
    // Otherwise, edges() can return an unaligned pointer marked as aligned
    // (the unalignedness gets lost).
    static constexpr int kNumInlineEdges = 2;
    union {
      // Out-of-line edge storage, having room for edges_capacity_ elements.
      // Note that due to __attribute__((packed)) below, this pointer may be
      // unaligned on 64-bit platforms, causing slightly less efficient
      // access to it in some cases.
      AhoCorasickEdge* edges;

      // Inline edge storage, used if edges_capacity_ == 0.
      AhoCorasickEdge inline_edges[kNumInlineEdges];
    } edges_;

    // Whether we have an edge for kMatchIDLabel or kOutputLinkLabel,
    // ie., hitting this node during traversal will create one or more
    // matches. This is redundant, but since every single lookup during
    // traversal needs this, it saves a few searches for us.
    bool has_outputs_ = false;

    // Number of unused left in edges_. Edges are always allocated from the
    // beginning and never deleted; those after num_edges_ will be marked with
    // kEmptyLabel (and have an undefined node_id). We store the number of
    // free edges instead of the more common number of _used_ edges, to be
    // sure that we are able to fit it in an uint8_t. num_edges() provides
    // a useful abstraction over this.
    uint8_t num_free_edges_ = kNumInlineEdges;

    // How many edges we have allocated room for (can never be more than
    // kEmptyLabel + 1). If equal to zero, we are not using heap storage,
    // but instead are using inline_edges.
    //
    // If not equal to zero, will be a multiple of 4, so that we can use
    // SIMD to accelerate looking for edges.
    uint16_t edges_capacity_ = 0;
  } __attribute__((packed));

  using SubstringPatternVector = std::vector<const MatcherStringPattern*>;

  // Given the set of patterns, compute how many nodes will the corresponding
  // Aho-Corasick tree have. Note that |patterns| need to be sorted.
  NodeID GetTreeSize(
      const std::vector<const MatcherStringPattern*>& patterns) const;

  void BuildAhoCorasickTree(const SubstringPatternVector& patterns);

  // Inserts a path for |pattern->pattern()| into the tree and adds
  // |pattern->id()| to the set of matches.
  void InsertPatternIntoAhoCorasickTree(const MatcherStringPattern* pattern);

  void CreateFailureAndOutputEdges();

  // Adds all pattern IDs to |matches| which are a suffix of the string
  // represented by |node|.
  void AccumulateMatchesForNode(
      const AhoCorasickNode* node,
      std::set<MatcherStringPattern::ID>* matches) const;

  // The nodes of a Aho-Corasick tree.
  std::vector<AhoCorasickNode> tree_;

  bool is_empty_ = true;
};

}  // namespace base

#endif  // BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_