summaryrefslogtreecommitdiff
path: root/src/third_party/re2/dist/re2/testing/backtrack.cc
blob: 216d259802f21e5153e307ba600b9210fe592f7c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
// Copyright 2008 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Tested by search_test.cc, exhaustive_test.cc, tester.cc
//
// Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
// except that it remembers where it has been, trading a lot of
// memory for a lot of time. It exists only for testing purposes.
//
// Let me repeat that.
//
// THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
//   - It uses a ton of memory.
//   - It uses a ton of stack.
//   - It uses CHECK and LOG(FATAL).
//   - It implements unanchored search by repeated anchored search.
//
// On the other hand, it is very simple and a good reference
// implementation for the more complicated regexp packages.
//
// In BUILD, this file is linked into the ":testing" library,
// not the main library, in order to make it harder to pick up
// accidentally.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include "util/util.h"
#include "util/logging.h"
#include "re2/pod_array.h"
#include "re2/prog.h"
#include "re2/regexp.h"

namespace re2 {

// Backtracker holds the state for a backtracking search.
//
// Excluding the search parameters, the main search state
// is just the "capture registers", which record, for the
// current execution, the string position at which each
// parenthesis was passed.  cap_[0] and cap_[1] are the
// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
//
// To avoid infinite loops during backtracking on expressions
// like (a*)*, the visited_[] bitmap marks the (state, string-position)
// pairs that have already been explored and are thus not worth
// re-exploring if we get there via another path.  Modern backtracking
// libraries engineer their program representation differently, to make
// such infinite loops possible to avoid without keeping a giant visited_
// bitmap, but visited_ works fine for a reference implementation
// and it has the nice benefit of making the search run in linear time.
class Backtracker {
 public:
  explicit Backtracker(Prog* prog);

  bool Search(const StringPiece& text, const StringPiece& context,
              bool anchored, bool longest,
              StringPiece* submatch, int nsubmatch);

 private:
  // Explores from instruction id at string position p looking for a match.
  // Returns true if found (so that caller can stop trying other possibilities).
  bool Visit(int id, const char* p);

  // Tries instruction id at string position p.
  // Returns true if a match is found.
  bool Try(int id, const char* p);

  // Search parameters
  Prog* prog_;              // program being run
  StringPiece text_;        // text being searched
  StringPiece context_;     // greater context of text being searched
  bool anchored_;           // whether search is anchored at text.begin()
  bool longest_;            // whether search wants leftmost-longest match
  bool endmatch_;           // whether search must end at text.end()
  StringPiece *submatch_;   // submatches to fill in
  int nsubmatch_;           //   # of submatches to fill in

  // Search state
  const char* cap_[64];         // capture registers
  PODArray<uint32_t> visited_;  // bitmap: (Inst*, char*) pairs visited

  Backtracker(const Backtracker&) = delete;
  Backtracker& operator=(const Backtracker&) = delete;
};

Backtracker::Backtracker(Prog* prog)
  : prog_(prog),
    anchored_(false),
    longest_(false),
    endmatch_(false),
    submatch_(NULL),
    nsubmatch_(0) {
}

// Runs a backtracking search.
bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
                         bool anchored, bool longest,
                         StringPiece* submatch, int nsubmatch) {
  text_ = text;
  context_ = context;
  if (context_.data() == NULL)
    context_ = text;
  if (prog_->anchor_start() && text.begin() > context_.begin())
    return false;
  if (prog_->anchor_end() && text.end() < context_.end())
    return false;
  anchored_ = anchored | prog_->anchor_start();
  longest_ = longest | prog_->anchor_end();
  endmatch_ = prog_->anchor_end();
  submatch_ = submatch;
  nsubmatch_ = nsubmatch;
  CHECK_LT(2*nsubmatch_, static_cast<int>(arraysize(cap_)));
  memset(cap_, 0, sizeof cap_);

  // We use submatch_[0] for our own bookkeeping,
  // so it had better exist.
  StringPiece sp0;
  if (nsubmatch < 1) {
    submatch_ = &sp0;
    nsubmatch_ = 1;
  }
  submatch_[0] = StringPiece();

  // Allocate new visited_ bitmap -- size is proportional
  // to text, so have to reallocate on each call to Search.
  int nvisited = prog_->size() * static_cast<int>(text.size()+1);
  nvisited = (nvisited + 31) / 32;
  visited_ = PODArray<uint32_t>(nvisited);
  memset(visited_.data(), 0, nvisited*sizeof visited_[0]);

  // Anchored search must start at text.begin().
  if (anchored_) {
    cap_[0] = text.data();
    return Visit(prog_->start(), text.data());
  }

  // Unanchored search, starting from each possible text position.
  // Notice that we have to try the empty string at the end of
  // the text, so the loop condition is p <= text.end(), not p < text.end().
  for (const char* p = text.data(); p <= text.data() + text.size(); p++) {
    cap_[0] = p;
    if (Visit(prog_->start(), p))  // Match must be leftmost; done.
      return true;
    // Avoid invoking undefined behavior (arithmetic on a null pointer)
    // by simply not continuing the loop.
    if (p == NULL)
      break;
  }
  return false;
}

// Explores from instruction id at string position p looking for a match.
// Return true if found (so that caller can stop trying other possibilities).
bool Backtracker::Visit(int id, const char* p) {
  // Check bitmap.  If we've already explored from here,
  // either it didn't match or it did but we're hoping for a better match.
  // Either way, don't go down that road again.
  CHECK(p <= text_.data() + text_.size());
  int n = id * static_cast<int>(text_.size()+1) +
          static_cast<int>(p-text_.data());
  CHECK_LT(n/32, visited_.size());
  if (visited_[n/32] & (1 << (n&31)))
    return false;
  visited_[n/32] |= 1 << (n&31);

  Prog::Inst* ip = prog_->inst(id);
  if (Try(id, p)) {
    if (longest_ && !ip->last())
      Visit(id+1, p);
    return true;
  }
  if (!ip->last())
    return Visit(id+1, p);
  return false;
}

// Tries instruction id at string position p.
// Returns true if a match is found.
bool Backtracker::Try(int id, const char* p) {
  // Pick out byte at current position.  If at end of string,
  // have to explore in hope of finishing a match.  Use impossible byte -1.
  int c = -1;
  if (p < text_.data() + text_.size())
    c = *p & 0xFF;

  Prog::Inst* ip = prog_->inst(id);
  switch (ip->opcode()) {
    default:
      LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
      return false;  // not reached

    case kInstAltMatch:
      // Ignored.
      return false;

    case kInstByteRange:
      if (ip->Matches(c))
        return Visit(ip->out(), p+1);
      return false;

    case kInstCapture:
      if (0 <= ip->cap() &&
          ip->cap() < static_cast<int>(arraysize(cap_))) {
        // Capture p to register, but save old value.
        const char* q = cap_[ip->cap()];
        cap_[ip->cap()] = p;
        bool ret = Visit(ip->out(), p);
        // Restore old value as we backtrack.
        cap_[ip->cap()] = q;
        return ret;
      }
      return Visit(ip->out(), p);

    case kInstEmptyWidth:
      if (ip->empty() & ~Prog::EmptyFlags(context_, p))
        return false;
      return Visit(ip->out(), p);

    case kInstNop:
      return Visit(ip->out(), p);

    case kInstMatch:
      // We found a match.  If it's the best so far, record the
      // parameters in the caller's submatch_ array.
      if (endmatch_ && p != context_.data() + context_.size())
        return false;
      cap_[1] = p;
      if (submatch_[0].data() == NULL ||
          (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
        // First match so far - or better match.
        for (int i = 0; i < nsubmatch_; i++)
          submatch_[i] = StringPiece(
              cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
      }
      return true;

    case kInstFail:
      return false;
  }
}

// Runs a backtracking search.
bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
                                 const StringPiece& context,
                                 Anchor anchor,
                                 MatchKind kind,
                                 StringPiece* match,
                                 int nmatch) {
  // If full match, we ask for an anchored longest match
  // and then check that match[0] == text.
  // So make sure match[0] exists.
  StringPiece sp0;
  if (kind == kFullMatch) {
    anchor = kAnchored;
    if (nmatch < 1) {
      match = &sp0;
      nmatch = 1;
    }
  }

  // Run the search.
  Backtracker b(this);
  bool anchored = anchor == kAnchored;
  bool longest = kind != kFirstMatch;
  if (!b.Search(text, context, anchored, longest, match, nmatch))
    return false;
  if (kind == kFullMatch && match[0].end() != text.end())
    return false;
  return true;
}

}  // namespace re2