summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--api/next.txt5
-rw-r--r--src/pkg/regexp/exec.go8
-rw-r--r--src/pkg/regexp/onepass.go582
-rw-r--r--src/pkg/regexp/onepass_test.go208
-rw-r--r--src/pkg/regexp/regexp.go8
-rw-r--r--src/pkg/regexp/syntax/prog.go542
-rw-r--r--src/pkg/regexp/syntax/prog_test.go202
7 files changed, 804 insertions, 751 deletions
diff --git a/api/next.txt b/api/next.txt
index eac2d203d..cdbac57f2 100644
--- a/api/next.txt
+++ b/api/next.txt
@@ -330,12 +330,7 @@ pkg net/http, type Server struct, ConnState func(net.Conn, ConnState)
pkg net/http, type Server struct, ErrorLog *log.Logger
pkg net/http, type Transport struct, TLSHandshakeTimeout time.Duration
pkg regexp/syntax, method (*Inst) MatchRunePos(int32) int
-pkg regexp/syntax, method (*Inst) OnePassNext(int32) uint32
-pkg regexp/syntax, method (*Prog) CompileOnePass() *Prog
-pkg regexp/syntax, method (*Prog) OnePassPrefix() (string, bool, uint32)
pkg regexp/syntax, method (InstOp) String() string
-pkg regexp/syntax, type Inst struct, Next []uint32
-pkg regexp/syntax, var NotOnePass *Prog
pkg runtime/debug, func SetPanicOnFault(bool) bool
pkg runtime/debug, func WriteHeapDump(uintptr)
pkg sync, method (*Pool) Get() interface{}
diff --git a/src/pkg/regexp/exec.go b/src/pkg/regexp/exec.go
index 22c33cfa5..c4cb201f6 100644
--- a/src/pkg/regexp/exec.go
+++ b/src/pkg/regexp/exec.go
@@ -37,7 +37,7 @@ type thread struct {
type machine struct {
re *Regexp // corresponding Regexp
p *syntax.Prog // compiled program
- op *syntax.Prog // compiled onepass program, or syntax.NotOnePass
+ op *onePassProg // compiled onepass program, or notOnePass
q0, q1 queue // two queues for runq, nextq
pool []*thread // pool of available threads
matched bool // whether a match was found
@@ -67,7 +67,7 @@ func (m *machine) newInputReader(r io.RuneReader) input {
}
// progMachine returns a new machine running the prog p.
-func progMachine(p, op *syntax.Prog) *machine {
+func progMachine(p *syntax.Prog, op *onePassProg) *machine {
m := &machine{p: p, op: op}
n := len(m.p.Inst)
m.q0 = queue{make([]uint32, n), make([]entry, 0, n)}
@@ -382,7 +382,7 @@ func (m *machine) onepass(i input, pos int) bool {
}
// peek at the input rune to see which branch of the Alt to take
case syntax.InstAlt, syntax.InstAltMatch:
- pc = int(inst.OnePassNext(r))
+ pc = int(onePassNext(&inst, r))
continue
case syntax.InstFail:
return m.matched
@@ -429,7 +429,7 @@ func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap i
} else {
i = m.newInputString(s)
}
- if m.op != syntax.NotOnePass {
+ if m.op != notOnePass {
if !m.onepass(i, pos) {
re.put(m)
return nil
diff --git a/src/pkg/regexp/onepass.go b/src/pkg/regexp/onepass.go
new file mode 100644
index 000000000..501fb28af
--- /dev/null
+++ b/src/pkg/regexp/onepass.go
@@ -0,0 +1,582 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+
+package regexp
+
+import (
+ "bytes"
+ "regexp/syntax"
+ "sort"
+ "unicode"
+)
+
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// "One-pass" regexp execution.
+// Some regexps can be analyzed to determine that they never need
+// backtracking: they are guaranteed to run in one pass over the string
+// without bothering to save all the usual NFA state.
+// Detect those and execute them more quickly.
+
+// A onePassProg is a compiled one-pass regular expression program.
+// It is the same as syntax.Prog except for the use of onePassInst.
+type onePassProg struct {
+ Inst []onePassInst
+ Start int // index of start instruction
+ NumCap int // number of InstCapture insts in re
+}
+
+// A onePassInst is a single instruction in a one-pass regular expression program.
+// It is the same as syntax.Inst except for the new 'Next' field.
+type onePassInst struct {
+ syntax.Inst
+ Next []uint32
+}
+
+// OnePassPrefix returns a literal string that all matches for the
+// regexp must start with. Complete is true if the prefix
+// is the entire match. Pc is the index of the last rune instruction
+// in the string. The OnePassPrefix skips over the mandatory
+// EmptyBeginText
+func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
+ i := &p.Inst[p.Start]
+ if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
+ return "", i.Op == syntax.InstMatch, uint32(p.Start)
+ }
+ pc = i.Out
+ i = &p.Inst[pc]
+ for i.Op == syntax.InstNop {
+ pc = i.Out
+ i = &p.Inst[pc]
+ }
+ // Avoid allocation of buffer if prefix is empty.
+ if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
+ return "", i.Op == syntax.InstMatch, uint32(p.Start)
+ }
+
+ // Have prefix; gather characters.
+ var buf bytes.Buffer
+ for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
+ buf.WriteRune(i.Rune[0])
+ pc, i = i.Out, &p.Inst[i.Out]
+ }
+ return buf.String(), i.Op == syntax.InstEmptyWidth && (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText != 0, pc
+}
+
+// OnePassNext selects the next actionable state of the prog, based on the input character.
+// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
+// One of the alternates may ultimately lead without input to end of line. If the instruction
+// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
+func onePassNext(i *onePassInst, r rune) uint32 {
+ next := i.MatchRunePos(r)
+ if next >= 0 {
+ return i.Next[next]
+ }
+ if i.Op == syntax.InstAltMatch {
+ return i.Out
+ }
+ return 0
+}
+
+func iop(i *syntax.Inst) syntax.InstOp {
+ op := i.Op
+ switch op {
+ case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
+ op = syntax.InstRune
+ }
+ return op
+}
+
+// Sparse Array implementation is used as a queueOnePass.
+type queueOnePass struct {
+ sparse []uint32
+ dense []uint32
+ size, nextIndex uint32
+}
+
+func (q *queueOnePass) empty() bool {
+ return q.nextIndex >= q.size
+}
+
+func (q *queueOnePass) next() (n uint32) {
+ n = q.dense[q.nextIndex]
+ q.nextIndex++
+ return
+}
+
+func (q *queueOnePass) clear() {
+ q.size = 0
+ q.nextIndex = 0
+}
+
+func (q *queueOnePass) reset() {
+ q.nextIndex = 0
+}
+
+func (q *queueOnePass) contains(u uint32) bool {
+ if u >= uint32(len(q.sparse)) {
+ return false
+ }
+ return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
+}
+
+func (q *queueOnePass) insert(u uint32) {
+ if !q.contains(u) {
+ q.insertNew(u)
+ }
+}
+
+func (q *queueOnePass) insertNew(u uint32) {
+ if u >= uint32(len(q.sparse)) {
+ return
+ }
+ q.sparse[u] = q.size
+ q.dense[q.size] = u
+ q.size++
+}
+
+func newQueue(size int) (q *queueOnePass) {
+ return &queueOnePass{
+ sparse: make([]uint32, size),
+ dense: make([]uint32, size),
+ }
+}
+
+// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
+// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
+// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
+// NextIp array with the single element mergeFailed is returned.
+// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
+const mergeFailed = uint32(0xffffffff)
+
+var (
+ noRune = []rune{}
+ noNext = []uint32{mergeFailed}
+)
+
+func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
+ leftLen := len(*leftRunes)
+ rightLen := len(*rightRunes)
+ if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
+ panic("mergeRuneSets odd length []rune")
+ }
+ var (
+ lx, rx int
+ )
+ merged := make([]rune, 0)
+ next := make([]uint32, 0)
+ ok := true
+ defer func() {
+ if !ok {
+ merged = nil
+ next = nil
+ }
+ }()
+
+ ix := -1
+ extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
+ if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
+ return false
+ }
+ merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
+ *newLow += 2
+ ix += 2
+ next = append(next, pc)
+ return true
+ }
+
+ for lx < leftLen || rx < rightLen {
+ switch {
+ case rx >= rightLen:
+ ok = extend(&lx, leftRunes, leftPC)
+ case lx >= leftLen:
+ ok = extend(&rx, rightRunes, rightPC)
+ case (*rightRunes)[rx] < (*leftRunes)[lx]:
+ ok = extend(&rx, rightRunes, rightPC)
+ default:
+ ok = extend(&lx, leftRunes, leftPC)
+ }
+ if !ok {
+ return noRune, noNext
+ }
+ }
+ return merged, next
+}
+
+// cleanupOnePass drops working memory, and restores certain shortcut instructions.
+func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
+ for ix, instOriginal := range original.Inst {
+ switch instOriginal.Op {
+ case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
+ case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
+ prog.Inst[ix].Next = nil
+ case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
+ prog.Inst[ix].Next = nil
+ prog.Inst[ix] = onePassInst{Inst: instOriginal}
+ }
+ }
+}
+
+// onePassCopy creates a copy of the original Prog, as we'll be modifying it
+func onePassCopy(prog *syntax.Prog) *onePassProg {
+ p := &onePassProg{
+ Start: prog.Start,
+ NumCap: prog.NumCap,
+ }
+ for _, inst := range prog.Inst {
+ p.Inst = append(p.Inst, onePassInst{Inst: inst})
+ }
+
+ // rewrites one or more common Prog constructs that enable some otherwise
+ // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
+ // ip A, that points to ips B & C.
+ // A:BC + B:DA => A:BC + B:CD
+ // A:BC + B:DC => A:DC + B:DC
+ for pc := range p.Inst {
+ switch p.Inst[pc].Op {
+ default:
+ continue
+ case syntax.InstAlt, syntax.InstAltMatch:
+ // A:Bx + B:Ay
+ p_A_Other := &p.Inst[pc].Out
+ p_A_Alt := &p.Inst[pc].Arg
+ // make sure a target is another Alt
+ instAlt := p.Inst[*p_A_Alt]
+ if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
+ p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
+ instAlt = p.Inst[*p_A_Alt]
+ if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
+ continue
+ }
+ }
+ instOther := p.Inst[*p_A_Other]
+ // Analyzing both legs pointing to Alts is for another day
+ if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
+ // too complicated
+ continue
+ }
+ // simple empty transition loop
+ // A:BC + B:DA => A:BC + B:DC
+ p_B_Alt := &p.Inst[*p_A_Alt].Out
+ p_B_Other := &p.Inst[*p_A_Alt].Arg
+ patch := false
+ if instAlt.Out == uint32(pc) {
+ patch = true
+ } else if instAlt.Arg == uint32(pc) {
+ patch = true
+ p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
+ }
+ if patch {
+ *p_B_Alt = *p_A_Other
+ }
+
+ // empty transition to common target
+ // A:BC + B:DC => A:DC + B:DC
+ if *p_A_Other == *p_B_Alt {
+ *p_A_Alt = *p_B_Other
+ }
+ }
+ }
+ return p
+}
+
+// runeSlice exists to permit sorting the case-folded rune sets.
+type runeSlice []rune
+
+func (p runeSlice) Len() int { return len(p) }
+func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
+func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
+
+// Sort is a convenience method.
+func (p runeSlice) Sort() {
+ sort.Sort(p)
+}
+
+var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
+var anyRune = []rune{0, unicode.MaxRune}
+
+// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
+// the match engine can always tell which branch to take. The routine may modify
+// p if it is turned into a onepass Prog. If it isn't possible for this to be a
+// onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
+// to the size of the Prog.
+func makeOnePass(p *onePassProg) *onePassProg {
+ // If the machine is very long, it's not worth the time to check if we can use one pass.
+ if len(p.Inst) >= 1000 {
+ return notOnePass
+ }
+
+ var (
+ instQueue = newQueue(len(p.Inst))
+ visitQueue = newQueue(len(p.Inst))
+ build func(uint32, *queueOnePass)
+ check func(uint32, map[uint32]bool) bool
+ onePassRunes = make([][]rune, len(p.Inst))
+ )
+ build = func(pc uint32, q *queueOnePass) {
+ if q.contains(pc) {
+ return
+ }
+ inst := p.Inst[pc]
+ switch inst.Op {
+ case syntax.InstAlt, syntax.InstAltMatch:
+ q.insert(inst.Out)
+ build(inst.Out, q)
+ q.insert(inst.Arg)
+ case syntax.InstMatch, syntax.InstFail:
+ default:
+ q.insert(inst.Out)
+ }
+ }
+
+ // check that paths from Alt instructions are unambiguous, and rebuild the new
+ // program as a onepass program
+ check = func(pc uint32, m map[uint32]bool) (ok bool) {
+ ok = true
+ inst := &p.Inst[pc]
+ if visitQueue.contains(pc) {
+ return
+ }
+ visitQueue.insert(pc)
+ switch inst.Op {
+ case syntax.InstAlt, syntax.InstAltMatch:
+ ok = check(inst.Out, m) && check(inst.Arg, m)
+ // check no-input paths to InstMatch
+ matchOut := m[inst.Out]
+ matchArg := m[inst.Arg]
+ if matchOut && matchArg {
+ ok = false
+ break
+ }
+ // Match on empty goes in inst.Out
+ if matchArg {
+ inst.Out, inst.Arg = inst.Arg, inst.Out
+ matchOut, matchArg = matchArg, matchOut
+ }
+ if matchOut {
+ m[pc] = true
+ inst.Op = syntax.InstAltMatch
+ }
+
+ // build a dispatch operator from the two legs of the alt.
+ onePassRunes[pc], inst.Next = mergeRuneSets(
+ &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
+ if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
+ ok = false
+ break
+ }
+ case syntax.InstCapture, syntax.InstNop:
+ ok = check(inst.Out, m)
+ m[pc] = m[inst.Out]
+ // pass matching runes back through these no-ops.
+ onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
+ inst.Next = []uint32{}
+ for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
+ inst.Next = append(inst.Next, inst.Out)
+ }
+ case syntax.InstEmptyWidth:
+ ok = check(inst.Out, m)
+ m[pc] = m[inst.Out]
+ onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
+ inst.Next = []uint32{}
+ for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
+ inst.Next = append(inst.Next, inst.Out)
+ }
+ case syntax.InstMatch, syntax.InstFail:
+ m[pc] = inst.Op == syntax.InstMatch
+ break
+ case syntax.InstRune:
+ ok = check(inst.Out, m)
+ m[pc] = false
+ if len(inst.Next) > 0 {
+ break
+ }
+ if len(inst.Rune) == 0 {
+ onePassRunes[pc] = []rune{}
+ inst.Next = []uint32{inst.Out}
+ break
+ }
+ runes := make([]rune, 0)
+ if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
+ r0 := inst.Rune[0]
+ runes = append(runes, r0, r0)
+ for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
+ runes = append(runes, r1, r1)
+ }
+ sort.Sort(runeSlice(runes))
+ } else {
+ runes = append(runes, inst.Rune...)
+ }
+ onePassRunes[pc] = runes
+ inst.Next = []uint32{}
+ for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
+ inst.Next = append(inst.Next, inst.Out)
+ }
+ inst.Op = syntax.InstRune
+ case syntax.InstRune1:
+ ok = check(inst.Out, m)
+ m[pc] = false
+ if len(inst.Next) > 0 {
+ break
+ }
+ runes := []rune{}
+ // expand case-folded runes
+ if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
+ r0 := inst.Rune[0]
+ runes = append(runes, r0, r0)
+ for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
+ runes = append(runes, r1, r1)
+ }
+ sort.Sort(runeSlice(runes))
+ } else {
+ runes = append(runes, inst.Rune[0], inst.Rune[0])
+ }
+ onePassRunes[pc] = runes
+ inst.Next = []uint32{}
+ for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
+ inst.Next = append(inst.Next, inst.Out)
+ }
+ inst.Op = syntax.InstRune
+ case syntax.InstRuneAny:
+ ok = check(inst.Out, m)
+ m[pc] = false
+ if len(inst.Next) > 0 {
+ break
+ }
+ onePassRunes[pc] = append([]rune{}, anyRune...)
+ inst.Next = []uint32{inst.Out}
+ case syntax.InstRuneAnyNotNL:
+ ok = check(inst.Out, m)
+ m[pc] = false
+ if len(inst.Next) > 0 {
+ break
+ }
+ onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
+ inst.Next = []uint32{}
+ for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
+ inst.Next = append(inst.Next, inst.Out)
+ }
+ }
+ return
+ }
+
+ instQueue.clear()
+ instQueue.insert(uint32(p.Start))
+ m := make(map[uint32]bool, len(p.Inst))
+ for !instQueue.empty() {
+ pc := instQueue.next()
+ inst := p.Inst[pc]
+ visitQueue.clear()
+ if !check(uint32(pc), m) {
+ p = notOnePass
+ break
+ }
+ switch inst.Op {
+ case syntax.InstAlt, syntax.InstAltMatch:
+ instQueue.insert(inst.Out)
+ instQueue.insert(inst.Arg)
+ case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop:
+ instQueue.insert(inst.Out)
+ case syntax.InstMatch:
+ case syntax.InstFail:
+ case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
+ default:
+ }
+ }
+ if p != notOnePass {
+ for i, _ := range p.Inst {
+ p.Inst[i].Rune = onePassRunes[i]
+ }
+ }
+ return p
+}
+
+// walk visits each Inst in the prog once, and applies the argument
+// function(ip, next), in pre-order.
+func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) {
+ var walk1 func(uint32)
+ progQueue := newQueue(len(prog.Inst))
+ walk1 = func(ip uint32) {
+ if progQueue.contains(ip) {
+ return
+ }
+ progQueue.insert(ip)
+ inst := prog.Inst[ip]
+ switch inst.Op {
+ case syntax.InstAlt, syntax.InstAltMatch:
+ for _, f := range funcs {
+ f(ip, inst.Out)
+ f(ip, inst.Arg)
+ }
+ walk1(inst.Out)
+ walk1(inst.Arg)
+ default:
+ for _, f := range funcs {
+ f(ip, inst.Out)
+ }
+ walk1(inst.Out)
+ }
+ }
+ walk1(uint32(prog.Start))
+}
+
+// find returns the Insts that match the argument predicate function
+func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) {
+ matches = []uint32{}
+
+ for ip := range prog.Inst {
+ if f(prog, ip) {
+ matches = append(matches, uint32(ip))
+ }
+ }
+ return
+}
+
+var notOnePass *onePassProg = nil
+
+// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
+// can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
+// Prog cannot be converted. For a one pass prog, the fundamental condition that must
+// be true is: at any InstAlt, there must be no ambiguity about what branch to take.
+func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
+ if prog.Start == 0 {
+ return notOnePass
+ }
+ // onepass regexp is anchored
+ if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
+ syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
+ return notOnePass
+ }
+ // every instruction leading to InstMatch must be EmptyEndText
+ for _, inst := range prog.Inst {
+ opOut := prog.Inst[inst.Out].Op
+ switch inst.Op {
+ default:
+ if opOut == syntax.InstMatch {
+ return notOnePass
+ }
+ case syntax.InstAlt, syntax.InstAltMatch:
+ if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
+ return notOnePass
+ }
+ case syntax.InstEmptyWidth:
+ if opOut == syntax.InstMatch {
+ if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
+ continue
+ }
+ return notOnePass
+ }
+ }
+ }
+ // Creates a slightly optimized copy of the original Prog
+ // that cleans up some Prog idioms that block valid onepass programs
+ p = onePassCopy(prog)
+
+ // checkAmbiguity on InstAlts, build onepass Prog if possible
+ p = makeOnePass(p)
+
+ if p != notOnePass {
+ cleanupOnePass(p, prog)
+ }
+ return p
+}
diff --git a/src/pkg/regexp/onepass_test.go b/src/pkg/regexp/onepass_test.go
new file mode 100644
index 000000000..7b2beea67
--- /dev/null
+++ b/src/pkg/regexp/onepass_test.go
@@ -0,0 +1,208 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package regexp
+
+import (
+ "reflect"
+ "regexp/syntax"
+ "testing"
+)
+
+var runeMergeTests = []struct {
+ left, right, merged []rune
+ next []uint32
+ leftPC, rightPC uint32
+}{
+ {
+ // empty rhs
+ []rune{69, 69},
+ []rune{},
+ []rune{69, 69},
+ []uint32{1},
+ 1, 2,
+ },
+ {
+ // identical runes, identical targets
+ []rune{69, 69},
+ []rune{69, 69},
+ []rune{},
+ []uint32{mergeFailed},
+ 1, 1,
+ },
+ {
+ // identical runes, different targets
+ []rune{69, 69},
+ []rune{69, 69},
+ []rune{},
+ []uint32{mergeFailed},
+ 1, 2,
+ },
+ {
+ // append right-first
+ []rune{69, 69},
+ []rune{71, 71},
+ []rune{69, 69, 71, 71},
+ []uint32{1, 2},
+ 1, 2,
+ },
+ {
+ // append, left-first
+ []rune{71, 71},
+ []rune{69, 69},
+ []rune{69, 69, 71, 71},
+ []uint32{2, 1},
+ 1, 2,
+ },
+ {
+ // successful interleave
+ []rune{60, 60, 71, 71, 101, 101},
+ []rune{69, 69, 88, 88},
+ []rune{60, 60, 69, 69, 71, 71, 88, 88, 101, 101},
+ []uint32{1, 2, 1, 2, 1},
+ 1, 2,
+ },
+ {
+ // left surrounds right
+ []rune{69, 74},
+ []rune{71, 71},
+ []rune{},
+ []uint32{mergeFailed},
+ 1, 2,
+ },
+ {
+ // right surrounds left
+ []rune{69, 74},
+ []rune{68, 75},
+ []rune{},
+ []uint32{mergeFailed},
+ 1, 2,
+ },
+ {
+ // overlap at interval begin
+ []rune{69, 74},
+ []rune{74, 75},
+ []rune{},
+ []uint32{mergeFailed},
+ 1, 2,
+ },
+ {
+ // overlap ar interval end
+ []rune{69, 74},
+ []rune{65, 69},
+ []rune{},
+ []uint32{mergeFailed},
+ 1, 2,
+ },
+ {
+ // overlap from above
+ []rune{69, 74},
+ []rune{71, 74},
+ []rune{},
+ []uint32{mergeFailed},
+ 1, 2,
+ },
+ {
+ // overlap from below
+ []rune{69, 74},
+ []rune{65, 71},
+ []rune{},
+ []uint32{mergeFailed},
+ 1, 2,
+ },
+ {
+ // out of order []rune
+ []rune{69, 74, 60, 65},
+ []rune{66, 67},
+ []rune{},
+ []uint32{mergeFailed},
+ 1, 2,
+ },
+}
+
+func TestMergeRuneSet(t *testing.T) {
+ for ix, test := range runeMergeTests {
+ merged, next := mergeRuneSets(&test.left, &test.right, test.leftPC, test.rightPC)
+ if !reflect.DeepEqual(merged, test.merged) {
+ t.Errorf("mergeRuneSet :%d (%v, %v) merged\n have\n%v\nwant\n%v", ix, test.left, test.right, merged, test.merged)
+ }
+ if !reflect.DeepEqual(next, test.next) {
+ t.Errorf("mergeRuneSet :%d(%v, %v) next\n have\n%v\nwant\n%v", ix, test.left, test.right, next, test.next)
+ }
+ }
+}
+
+const noStr = `!`
+
+var onePass = &onePassProg{}
+
+var onePassTests = []struct {
+ re string
+ onePass *onePassProg
+ prog string
+}{
+ {`^(?:a|(?:a*))$`, notOnePass, noStr},
+ {`^(?:(a)|(?:a*))$`, notOnePass, noStr},
+ {`^(?:(?:(?:.(?:$))?))$`, onePass, `a`},
+ {`^abcd$`, onePass, `abcd`},
+ {`^abcd$`, onePass, `abcde`},
+ {`^(?:(?:a{0,})*?)$`, onePass, `a`},
+ {`^(?:(?:a+)*)$`, onePass, ``},
+ {`^(?:(?:a|(?:aa)))$`, onePass, ``},
+ {`^(?:[^\s\S])$`, onePass, ``},
+ {`^(?:(?:a{3,4}){0,})$`, notOnePass, `aaaaaa`},
+ {`^(?:(?:a+)*)$`, onePass, `a`},
+ {`^(?:(?:(?:a*)+))$`, onePass, noStr},
+ {`^(?:(?:a+)*)$`, onePass, ``},
+ {`^[a-c]+$`, onePass, `abc`},
+ {`^[a-c]*$`, onePass, `abcdabc`},
+ {`^(?:a*)$`, onePass, `aaaaaaa`},
+ {`^(?:(?:aa)|a)$`, onePass, `a`},
+ {`^[a-c]*`, notOnePass, `abcdabc`},
+ {`^[a-c]*$`, onePass, `abc`},
+ {`^...$`, onePass, ``},
+ {`^(?:a|(?:aa))$`, onePass, `a`},
+ {`^[a-c]*`, notOnePass, `abcabc`},
+ {`^a((b))c$`, onePass, noStr},
+ {`^a.[l-nA-Cg-j]?e$`, onePass, noStr},
+ {`^a((b))$`, onePass, noStr},
+ {`^a(?:(b)|(c))c$`, onePass, noStr},
+ {`^a(?:(b*)|(c))c$`, notOnePass, noStr},
+ {`^a(?:b|c)$`, onePass, noStr},
+ {`^a(?:b?|c)$`, onePass, noStr},
+ {`^a(?:b?|c?)$`, notOnePass, noStr},
+ {`^a(?:b?|c+)$`, onePass, noStr},
+ {`^a(?:b+|(bc))d$`, notOnePass, noStr},
+ {`^a(?:bc)+$`, onePass, noStr},
+ {`^a(?:[bcd])+$`, onePass, noStr},
+ {`^a((?:[bcd])+)$`, onePass, noStr},
+ {`^a(:?b|c)*d$`, onePass, `abbbccbbcbbd"`},
+ {`^.bc(d|e)*$`, onePass, `abcddddddeeeededd`},
+ {`^(?:(?:aa)|.)$`, notOnePass, `a`},
+ {`^(?:(?:a{1,2}){1,2})$`, notOnePass, `aaaa`},
+}
+
+func TestCompileOnePass(t *testing.T) {
+ var (
+ p *syntax.Prog
+ re *syntax.Regexp
+ err error
+ )
+ for _, test := range onePassTests {
+ if re, err = syntax.Parse(test.re, syntax.Perl); err != nil {
+ t.Errorf("Parse(%q) got err:%s, want success", test.re, err)
+ continue
+ }
+ // needs to be done before compile...
+ re = re.Simplify()
+ if p, err = syntax.Compile(re); err != nil {
+ t.Errorf("Compile(%q) got err:%s, want success", test.re, err)
+ continue
+ }
+ onePass = compileOnePass(p)
+ if (onePass == notOnePass) != (test.onePass == notOnePass) {
+ t.Errorf("CompileOnePass(%q) got %v, expected %v", test.re, onePass, test.onePass)
+ }
+ }
+}
diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go
index cca21e750..0b8336a04 100644
--- a/src/pkg/regexp/regexp.go
+++ b/src/pkg/regexp/regexp.go
@@ -83,7 +83,7 @@ type Regexp struct {
// read-only after Compile
expr string // as passed to Compile
prog *syntax.Prog // compiled program
- onepass *syntax.Prog // onpass program or nil
+ onepass *onePassProg // onpass program or nil
prefix string // required prefix in unanchored matches
prefixBytes []byte // prefix, as a []byte
prefixComplete bool // prefix is the entire regexp
@@ -165,16 +165,16 @@ func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
regexp := &Regexp{
expr: expr,
prog: prog,
- onepass: prog.CompileOnePass(),
+ onepass: compileOnePass(prog),
numSubexp: maxCap,
subexpNames: capNames,
cond: prog.StartCond(),
longest: longest,
}
- if regexp.onepass == syntax.NotOnePass {
+ if regexp.onepass == notOnePass {
regexp.prefix, regexp.prefixComplete = prog.Prefix()
} else {
- regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = prog.OnePassPrefix()
+ regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog)
}
if regexp.prefix != "" {
// TODO(rsc): Remove this allocation by adding
diff --git a/src/pkg/regexp/syntax/prog.go b/src/pkg/regexp/syntax/prog.go
index 089b90db1..29bd282d0 100644
--- a/src/pkg/regexp/syntax/prog.go
+++ b/src/pkg/regexp/syntax/prog.go
@@ -6,7 +6,6 @@ package syntax
import (
"bytes"
- "sort"
"strconv"
"unicode"
)
@@ -115,7 +114,6 @@ type Inst struct {
Out uint32 // all but InstMatch, InstFail
Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
Rune []rune
- Next []uint32 // If input rune matches
}
func (p *Prog) String() string {
@@ -165,36 +163,6 @@ func (p *Prog) Prefix() (prefix string, complete bool) {
return buf.String(), i.Op == InstMatch
}
-// OnePassPrefix returns a literal string that all matches for the
-// regexp must start with. Complete is true if the prefix
-// is the entire match. Pc is the index of the last rune instruction
-// in the string. The OnePassPrefix skips over the mandatory
-// EmptyBeginText
-func (p *Prog) OnePassPrefix() (prefix string, complete bool, pc uint32) {
- i := &p.Inst[p.Start]
- if i.Op != InstEmptyWidth || (EmptyOp(i.Arg))&EmptyBeginText == 0 {
- return "", i.Op == InstMatch, uint32(p.Start)
- }
- pc = i.Out
- i = &p.Inst[pc]
- for i.Op == InstNop {
- pc = i.Out
- i = &p.Inst[pc]
- }
- // Avoid allocation of buffer if prefix is empty.
- if i.op() != InstRune || len(i.Rune) != 1 {
- return "", i.Op == InstMatch, uint32(p.Start)
- }
-
- // Have prefix; gather characters.
- var buf bytes.Buffer
- for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
- buf.WriteRune(i.Rune[0])
- pc, i = i.Out, &p.Inst[i.Out]
- }
- return buf.String(), i.Op == InstEmptyWidth && (EmptyOp(i.Arg))&EmptyBeginText != 0, pc
-}
-
// StartCond returns the leading empty-width conditions that must
// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
func (p *Prog) StartCond() EmptyOp {
@@ -221,29 +189,17 @@ Loop:
const noMatch = -1
-// OnePassNext selects the next actionable state of the prog, based on the input character.
-// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
-// One of the alternates may ultimately lead without input to end of line. If the instruction
-// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
-func (i *Inst) OnePassNext(r rune) uint32 {
- next := i.MatchRunePos(r)
- if next != noMatch {
- return i.Next[next]
- }
- if i.Op == InstAltMatch {
- return i.Out
- }
- return 0
-}
-
// MatchRune returns true if the instruction matches (and consumes) r.
// It should only be called when i.Op == InstRune.
func (i *Inst) MatchRune(r rune) bool {
return i.MatchRunePos(r) != noMatch
}
-// MatchRunePos returns the index of the rune pair if the instruction matches.
-// It should only be called when i.Op == InstRune.
+// MatchRunePos checks whether the instruction matches (and consumes) r.
+// If so, MatchRunePos returns the index of the matching rune pair
+// (or, when len(i.Rune) == 1, rune singleton).
+// If not, MatchRunePos returns -1.
+// MatchRunePos should only be called when i.Op == InstRune.
func (i *Inst) MatchRunePos(r rune) int {
rune := i.Rune
@@ -387,491 +343,3 @@ func dumpInst(b *bytes.Buffer, i *Inst) {
bw(b, "anynotnl -> ", u32(i.Out))
}
}
-
-// Sparse Array implementation is used as a queue.
-type queue struct {
- sparse []uint32
- dense []uint32
- size, nextIndex uint32
-}
-
-func (q *queue) empty() bool {
- return q.nextIndex >= q.size
-}
-
-func (q *queue) next() (n uint32) {
- n = q.dense[q.nextIndex]
- q.nextIndex++
- return
-}
-
-func (q *queue) clear() {
- q.size = 0
- q.nextIndex = 0
-}
-
-func (q *queue) reset() {
- q.nextIndex = 0
-}
-
-func (q *queue) contains(u uint32) bool {
- if u >= uint32(len(q.sparse)) {
- return false
- }
- return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
-}
-
-func (q *queue) insert(u uint32) {
- if !q.contains(u) {
- q.insertNew(u)
- }
-}
-
-func (q *queue) insertNew(u uint32) {
- if u >= uint32(len(q.sparse)) {
- return
- }
- q.sparse[u] = q.size
- q.dense[q.size] = u
- q.size++
-}
-
-func newQueue(size int) (q *queue) {
- return &queue{
- sparse: make([]uint32, size),
- dense: make([]uint32, size),
- }
-}
-
-// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
-// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
-// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
-// NextIp array with the single element mergeFailed is returned.
-// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
-const mergeFailed = uint32(0xffffffff)
-
-var (
- noRune = []rune{}
- noNext = []uint32{mergeFailed}
-)
-
-func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
- leftLen := len(*leftRunes)
- rightLen := len(*rightRunes)
- if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
- panic("mergeRuneSets odd length []rune")
- }
- var (
- lx, rx int
- )
- merged := make([]rune, 0)
- next := make([]uint32, 0)
- ok := true
- defer func() {
- if !ok {
- merged = nil
- next = nil
- }
- }()
-
- ix := -1
- extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
- if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
- return false
- }
- merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
- *newLow += 2
- ix += 2
- next = append(next, pc)
- return true
- }
-
- for lx < leftLen || rx < rightLen {
- switch {
- case rx >= rightLen:
- ok = extend(&lx, leftRunes, leftPC)
- case lx >= leftLen:
- ok = extend(&rx, rightRunes, rightPC)
- case (*rightRunes)[rx] < (*leftRunes)[lx]:
- ok = extend(&rx, rightRunes, rightPC)
- default:
- ok = extend(&lx, leftRunes, leftPC)
- }
- if !ok {
- return noRune, noNext
- }
- }
- return merged, next
-}
-
-// cleanupOnePass drops working memory, and restores certain shortcut instructions.
-func (prog *Prog) cleanupOnePass(pOriginal *Prog) {
- for ix, instOriginal := range pOriginal.Inst {
- switch instOriginal.Op {
- case InstAlt, InstAltMatch, InstRune:
- case InstCapture, InstEmptyWidth, InstNop, InstMatch, InstFail:
- prog.Inst[ix].Next = nil
- case InstRune1, InstRuneAny, InstRuneAnyNotNL:
- prog.Inst[ix].Next = nil
- prog.Inst[ix] = instOriginal
- }
- }
-}
-
-// onePassCopy creates a copy of the original Prog, as we'll be modifying it
-func (prog *Prog) onePassCopy() *Prog {
- p := &Prog{
- Inst: append([]Inst{}[:], prog.Inst...),
- Start: prog.Start,
- NumCap: prog.NumCap,
- }
- for _, inst := range p.Inst {
- inst.Next = make([]uint32, 0)
- }
-
- // rewrites one or more common Prog constructs that enable some otherwise
- // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
- // ip A, that points to ips B & C.
- // A:BC + B:DA => A:BC + B:CD
- // A:BC + B:DC => A:DC + B:DC
- for pc := range p.Inst {
- switch p.Inst[pc].Op {
- default:
- continue
- case InstAlt, InstAltMatch:
- // A:Bx + B:Ay
- p_A_Other := &p.Inst[pc].Out
- p_A_Alt := &p.Inst[pc].Arg
- // make sure a target is another Alt
- instAlt := p.Inst[*p_A_Alt]
- if !(instAlt.Op == InstAlt || instAlt.Op == InstAltMatch) {
- p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
- instAlt = p.Inst[*p_A_Alt]
- if !(instAlt.Op == InstAlt || instAlt.Op == InstAltMatch) {
- continue
- }
- }
- instOther := p.Inst[*p_A_Other]
- // Analyzing both legs pointing to Alts is for another day
- if instOther.Op == InstAlt || instOther.Op == InstAltMatch {
- // too complicated
- continue
- }
- // simple empty transition loop
- // A:BC + B:DA => A:BC + B:DC
- p_B_Alt := &p.Inst[*p_A_Alt].Out
- p_B_Other := &p.Inst[*p_A_Alt].Arg
- patch := false
- if instAlt.Out == uint32(pc) {
- patch = true
- } else if instAlt.Arg == uint32(pc) {
- patch = true
- p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
- }
- if patch {
- *p_B_Alt = *p_A_Other
- }
-
- // empty transition to common target
- // A:BC + B:DC => A:DC + B:DC
- if *p_A_Other == *p_B_Alt {
- *p_A_Alt = *p_B_Other
- }
- }
- }
- return p
-}
-
-// runeSlice exists to permit sorting the case-folded rune sets.
-type runeSlice []rune
-
-func (p runeSlice) Len() int { return len(p) }
-func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
-func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
-
-// Sort is a convenience method.
-func (p runeSlice) Sort() {
- sort.Sort(p)
-}
-
-// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
-// the match engine can always tell which branch to take. The routine may modify
-// p if it is turned into a onepass Prog. If it isn't possible for this to be a
-// onepass Prog, the Prog syntax.NotOnePass is returned. makeOnePass is recursive
-// to the size of the Prog
-func (p *Prog) makeOnePass() *Prog {
- var (
- instQueue = newQueue(len(p.Inst))
- visitQueue = newQueue(len(p.Inst))
- build func(uint32, *queue)
- check func(uint32, map[uint32]bool) bool
- onePassRunes = make([][]rune, len(p.Inst))
- )
- build = func(pc uint32, q *queue) {
- if q.contains(pc) {
- return
- }
- inst := p.Inst[pc]
- switch inst.Op {
- case InstAlt, InstAltMatch:
- q.insert(inst.Out)
- build(inst.Out, q)
- q.insert(inst.Arg)
- case InstMatch, InstFail:
- default:
- q.insert(inst.Out)
- }
- }
-
- // check that paths from Alt instructions are unambiguous, and rebuild the new
- // program as a onepass program
- check = func(pc uint32, m map[uint32]bool) (ok bool) {
- ok = true
- inst := &p.Inst[pc]
- if visitQueue.contains(pc) {
- return
- }
- visitQueue.insert(pc)
- switch inst.Op {
- case InstAlt, InstAltMatch:
- ok = check(inst.Out, m) && check(inst.Arg, m)
- // check no-input paths to InstMatch
- matchOut := m[inst.Out]
- matchArg := m[inst.Arg]
- if matchOut && matchArg {
- ok = false
- break
- }
- // Match on empty goes in inst.Out
- if matchArg {
- inst.Out, inst.Arg = inst.Arg, inst.Out
- matchOut, matchArg = matchArg, matchOut
- }
- if matchOut {
- m[pc] = true
- inst.Op = InstAltMatch
- }
-
- // build a dispatch operator from the two legs of the alt.
- onePassRunes[pc], inst.Next = mergeRuneSets(
- &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
- if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
- ok = false
- break
- }
- case InstCapture, InstNop:
- ok = check(inst.Out, m)
- m[pc] = m[inst.Out]
- // pass matching runes back through these no-ops.
- onePassRunes[pc] = append([]rune{}[:], onePassRunes[inst.Out][:]...)
- inst.Next = []uint32{}
- for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
- inst.Next = append(inst.Next, inst.Out)
- }
- case InstEmptyWidth:
- ok = check(inst.Out, m)
- m[pc] = m[inst.Out]
- onePassRunes[pc] = append([]rune{}[:], onePassRunes[inst.Out][:]...)
- inst.Next = []uint32{}
- for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
- inst.Next = append(inst.Next, inst.Out)
- }
- case InstMatch, InstFail:
- m[pc] = inst.Op == InstMatch
- break
- case InstRune:
- ok = check(inst.Out, m)
- m[pc] = false
- if len(inst.Next) > 0 {
- break
- }
- if len(inst.Rune) == 0 {
- onePassRunes[pc] = []rune{}[:]
- inst.Next = []uint32{inst.Out}
- break
- }
- runes := make([]rune, 0)
- if len(inst.Rune) == 1 && Flags(inst.Arg)&FoldCase != 0 {
- r0 := inst.Rune[0]
- runes = append(runes, r0, r0)
- for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
- runes = append(runes, r1, r1)
- }
- sort.Sort(runeSlice(runes))
- } else {
- runes = append(runes, inst.Rune...)
- }
- onePassRunes[pc] = runes
- inst.Next = []uint32{}
- for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
- inst.Next = append(inst.Next, inst.Out)
- }
- inst.Op = InstRune
- case InstRune1:
- ok = check(inst.Out, m)
- m[pc] = false
- if len(inst.Next) > 0 {
- break
- }
- runes := []rune{}[:]
- // expand case-folded runes
- if Flags(inst.Arg)&FoldCase != 0 {
- r0 := inst.Rune[0]
- runes = append(runes, r0, r0)
- for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
- runes = append(runes, r1, r1)
- }
- sort.Sort(runeSlice(runes))
- } else {
- runes = append(runes, inst.Rune[0], inst.Rune[0])
- }
- onePassRunes[pc] = runes
- inst.Next = []uint32{}
- for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
- inst.Next = append(inst.Next, inst.Out)
- }
- inst.Op = InstRune
- case InstRuneAny:
- ok = check(inst.Out, m)
- m[pc] = false
- if len(inst.Next) > 0 {
- break
- }
- onePassRunes[pc] = append([]rune{}[:], anyRune[:]...)
- inst.Next = []uint32{inst.Out}
- case InstRuneAnyNotNL:
- ok = check(inst.Out, m)
- m[pc] = false
- if len(inst.Next) > 0 {
- break
- }
- onePassRunes[pc] = append([]rune{}[:], anyRuneNotNL[:]...)
- inst.Next = []uint32{}
- for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
- inst.Next = append(inst.Next, inst.Out)
- }
- }
- return
- }
-
- instQueue.clear()
- instQueue.insert(uint32(p.Start))
- m := make(map[uint32]bool, len(p.Inst))
- for !instQueue.empty() {
- pc := instQueue.next()
- inst := p.Inst[pc]
- visitQueue.clear()
- if !check(uint32(pc), m) {
- p = NotOnePass
- break
- }
- switch inst.Op {
- case InstAlt, InstAltMatch:
- instQueue.insert(inst.Out)
- instQueue.insert(inst.Arg)
- case InstCapture, InstEmptyWidth, InstNop:
- instQueue.insert(inst.Out)
- case InstMatch:
- case InstFail:
- case InstRune, InstRune1, InstRuneAny, InstRuneAnyNotNL:
- default:
- }
- }
- if p != NotOnePass {
- for i, _ := range p.Inst {
- p.Inst[i].Rune = onePassRunes[i][:]
- }
- }
- return p
-}
-
-// walk visits each Inst in the prog once, and applies the argument
-// function(ip, next), in pre-order.
-func (prog *Prog) walk(funcs ...func(ip, next uint32)) {
- var walk1 func(uint32)
- progQueue := newQueue(len(prog.Inst))
- walk1 = func(ip uint32) {
- if progQueue.contains(ip) {
- return
- }
- progQueue.insert(ip)
- inst := prog.Inst[ip]
- switch inst.Op {
- case InstAlt, InstAltMatch:
- for _, f := range funcs {
- f(ip, inst.Out)
- f(ip, inst.Arg)
- }
- walk1(inst.Out)
- walk1(inst.Arg)
- default:
- for _, f := range funcs {
- f(ip, inst.Out)
- }
- walk1(inst.Out)
- }
- }
- walk1(uint32(prog.Start))
-}
-
-// find returns the Insts that match the argument predicate function
-func (prog *Prog) find(f func(*Prog, int) bool) (matches []uint32) {
- matches = []uint32{}
-
- for ip := range prog.Inst {
- if f(prog, ip) {
- matches = append(matches, uint32(ip))
- }
- }
- return
-}
-
-var NotOnePass *Prog = nil
-var debug = false
-
-// CompileOnePass returns a new *Prog suitable for onePass execution if the original Prog
-// can be recharacterized as a one-pass regexp program, or syntax.NotOnePass if the
-// Prog cannot be converted. For a one pass prog, the fundamental condition that must
-// be true is: at any InstAlt, there must be no ambiguity about what branch to take.
-func (prog *Prog) CompileOnePass() (p *Prog) {
- if prog.Start == 0 {
- return NotOnePass
- }
- // onepass regexp is anchored
- if prog.Inst[prog.Start].Op != InstEmptyWidth ||
- EmptyOp(prog.Inst[prog.Start].Arg)&EmptyBeginText != EmptyBeginText {
- return NotOnePass
- }
- // every instruction leading to InstMatch must be EmptyEndText
- for _, inst := range prog.Inst {
- opOut := prog.Inst[inst.Out].Op
- switch inst.Op {
- default:
- if opOut == InstMatch {
- return NotOnePass
- }
- case InstAlt, InstAltMatch:
- if opOut == InstMatch || prog.Inst[inst.Arg].Op == InstMatch {
- return NotOnePass
- }
- case InstEmptyWidth:
- if opOut == InstMatch {
- if EmptyOp(inst.Arg)&EmptyEndText == EmptyEndText {
- continue
- }
- return NotOnePass
- }
- }
- }
- // Creates a slightly optimized copy of the original Prog
- // that cleans up some Prog idioms that block valid onepass programs
- p = prog.onePassCopy()
-
- // checkAmbiguity on InstAlts, build onepass Prog if possible
- p = p.makeOnePass()
-
- if p != NotOnePass {
- p.cleanupOnePass(prog)
- }
- return p
-}
diff --git a/src/pkg/regexp/syntax/prog_test.go b/src/pkg/regexp/syntax/prog_test.go
index 66beb7435..50bfa3d4b 100644
--- a/src/pkg/regexp/syntax/prog_test.go
+++ b/src/pkg/regexp/syntax/prog_test.go
@@ -4,10 +4,7 @@
package syntax
-import (
- "reflect"
- "testing"
-)
+import "testing"
var compileTests = []struct {
Regexp string
@@ -115,200 +112,3 @@ func BenchmarkEmptyOpContext(b *testing.B) {
EmptyOpContext(r1, -1)
}
}
-
-var runeMergeTests = []struct {
- left, right, merged []rune
- next []uint32
- leftPC, rightPC uint32
-}{
- {
- // empty rhs
- []rune{69, 69},
- []rune{},
- []rune{69, 69},
- []uint32{1},
- 1, 2,
- },
- {
- // identical runes, identical targets
- []rune{69, 69},
- []rune{69, 69},
- []rune{},
- []uint32{mergeFailed},
- 1, 1,
- },
- {
- // identical runes, different targets
- []rune{69, 69},
- []rune{69, 69},
- []rune{},
- []uint32{mergeFailed},
- 1, 2,
- },
- {
- // append right-first
- []rune{69, 69},
- []rune{71, 71},
- []rune{69, 69, 71, 71},
- []uint32{1, 2},
- 1, 2,
- },
- {
- // append, left-first
- []rune{71, 71},
- []rune{69, 69},
- []rune{69, 69, 71, 71},
- []uint32{2, 1},
- 1, 2,
- },
- {
- // successful interleave
- []rune{60, 60, 71, 71, 101, 101},
- []rune{69, 69, 88, 88},
- []rune{60, 60, 69, 69, 71, 71, 88, 88, 101, 101},
- []uint32{1, 2, 1, 2, 1},
- 1, 2,
- },
- {
- // left surrounds right
- []rune{69, 74},
- []rune{71, 71},
- []rune{},
- []uint32{mergeFailed},
- 1, 2,
- },
- {
- // right surrounds left
- []rune{69, 74},
- []rune{68, 75},
- []rune{},
- []uint32{mergeFailed},
- 1, 2,
- },
- {
- // overlap at interval begin
- []rune{69, 74},
- []rune{74, 75},
- []rune{},
- []uint32{mergeFailed},
- 1, 2,
- },
- {
- // overlap ar interval end
- []rune{69, 74},
- []rune{65, 69},
- []rune{},
- []uint32{mergeFailed},
- 1, 2,
- },
- {
- // overlap from above
- []rune{69, 74},
- []rune{71, 74},
- []rune{},
- []uint32{mergeFailed},
- 1, 2,
- },
- {
- // overlap from below
- []rune{69, 74},
- []rune{65, 71},
- []rune{},
- []uint32{mergeFailed},
- 1, 2,
- },
- {
- // out of order []rune
- []rune{69, 74, 60, 65},
- []rune{66, 67},
- []rune{},
- []uint32{mergeFailed},
- 1, 2,
- },
-}
-
-func TestMergeRuneSet(t *testing.T) {
- for ix, test := range runeMergeTests {
- merged, next := mergeRuneSets(&test.left, &test.right, test.leftPC, test.rightPC)
- if !reflect.DeepEqual(merged, test.merged) {
- t.Errorf("mergeRuneSet :%d (%v, %v) merged\n have\n%v\nwant\n%v", ix, test.left, test.right, merged, test.merged)
- }
- if !reflect.DeepEqual(next, test.next) {
- t.Errorf("mergeRuneSet :%d(%v, %v) next\n have\n%v\nwant\n%v", ix, test.left, test.right, next, test.next)
- }
- }
-}
-
-const noStr = `!`
-
-var onePass = &Prog{}
-
-var onePassTests = []struct {
- re string
- onePass *Prog
- prog string
-}{
- {`^(?:a|(?:a*))$`, NotOnePass, noStr},
- {`^(?:(a)|(?:a*))$`, NotOnePass, noStr},
- {`^(?:(?:(?:.(?:$))?))$`, onePass, `a`},
- {`^abcd$`, onePass, `abcd`},
- {`^abcd$`, onePass, `abcde`},
- {`^(?:(?:a{0,})*?)$`, onePass, `a`},
- {`^(?:(?:a+)*)$`, onePass, ``},
- {`^(?:(?:a|(?:aa)))$`, onePass, ``},
- {`^(?:[^\s\S])$`, onePass, ``},
- {`^(?:(?:a{3,4}){0,})$`, NotOnePass, `aaaaaa`},
- {`^(?:(?:a+)*)$`, onePass, `a`},
- {`^(?:(?:(?:a*)+))$`, onePass, noStr},
- {`^(?:(?:a+)*)$`, onePass, ``},
- {`^[a-c]+$`, onePass, `abc`},
- {`^[a-c]*$`, onePass, `abcdabc`},
- {`^(?:a*)$`, onePass, `aaaaaaa`},
- {`^(?:(?:aa)|a)$`, onePass, `a`},
- {`^[a-c]*`, NotOnePass, `abcdabc`},
- {`^[a-c]*$`, onePass, `abc`},
- {`^...$`, onePass, ``},
- {`^(?:a|(?:aa))$`, onePass, `a`},
- {`^[a-c]*`, NotOnePass, `abcabc`},
- {`^a((b))c$`, onePass, noStr},
- {`^a.[l-nA-Cg-j]?e$`, onePass, noStr},
- {`^a((b))$`, onePass, noStr},
- {`^a(?:(b)|(c))c$`, onePass, noStr},
- {`^a(?:(b*)|(c))c$`, NotOnePass, noStr},
- {`^a(?:b|c)$`, onePass, noStr},
- {`^a(?:b?|c)$`, onePass, noStr},
- {`^a(?:b?|c?)$`, NotOnePass, noStr},
- {`^a(?:b?|c+)$`, onePass, noStr},
- {`^a(?:b+|(bc))d$`, NotOnePass, noStr},
- {`^a(?:bc)+$`, onePass, noStr},
- {`^a(?:[bcd])+$`, onePass, noStr},
- {`^a((?:[bcd])+)$`, onePass, noStr},
- {`^a(:?b|c)*d$`, onePass, `abbbccbbcbbd"`},
- {`^.bc(d|e)*$`, onePass, `abcddddddeeeededd`},
- {`^(?:(?:aa)|.)$`, NotOnePass, `a`},
- {`^(?:(?:a{1,2}){1,2})$`, NotOnePass, `aaaa`},
-}
-
-func TestCompileOnePass(t *testing.T) {
- var (
- p *Prog
- re *Regexp
- err error
- )
- for _, test := range onePassTests {
- if re, err = Parse(test.re, Perl); err != nil {
- t.Errorf("Parse(%q) got err:%s, want success", test.re, err)
- continue
- }
- // needs to be done before compile...
- re = re.Simplify()
- if p, err = Compile(re); err != nil {
- t.Errorf("Compile(%q) got err:%s, want success", test.re, err)
- continue
- }
- onePass = p.CompileOnePass()
- if (onePass == NotOnePass) != (test.onePass == NotOnePass) {
- t.Errorf("CompileOnePass(%q) got %v, expected %v", test.re, onePass, test.onePass)
- }
- }
-}