strings: implement a faster single-string Replacer

The string searching is implemented separately so other functions may make use of it in the future. benchmark old ns/op new ns/op delta BenchmarkSingleMaxSkipping 125889 2474 -98.03% BenchmarkSingleLongSuffixFail 16252 1996 -87.72% BenchmarkSingleMatch 260793 136266 -47.75% benchmark old MB/s new MB/s speedup BenchmarkSingleMaxSkipping 79.43 4041.57 50.88x BenchmarkSingleLongSuffixFail 61.65 501.81 8.14x BenchmarkSingleMatch 57.52 110.08 1.91x R=nigeltao CC=golang-dev http://codereview.appspot.com/6545049 Committer: Nigel Tao <nigeltao@golang.org>
author: Eric Roshan-Eisner <eric.d.eisner@gmail.com> 2012-09-28 12:34:18 +1000
committer: Eric Roshan-Eisner <eric.d.eisner@gmail.com> 2012-09-28 12:34:18 +1000
commit: 02d709e3d3d73e7ccce1c2df482dc76e2aae2f4b (patch)
tree: 35cf0078c065f3566283006533c620bdc1580c89 /src/pkg/strings
parent: 5fc3807c1e3d76dae95393d13fe08d22f6ff92e2 (diff)
download: go-02d709e3d3d73e7ccce1c2df482dc76e2aae2f4b.tar.gz
5 files changed, 330 insertions, 8 deletions
diff --git a/src/pkg/strings/export_test.go b/src/pkg/strings/export_test.go
index 3a8c85140..17c806aa5 100644
--- a/src/pkg/strings/export_test.go
+++ b/src/pkg/strings/export_test.go
@@ -34,3 +34,12 @@ func (r *genericReplacer) printNode(t *trieNode, depth int) (s string) {
 	}
 	return
 }
+
+func StringFind(pattern, text string) int {
+	return makeStringFinder(pattern).next(text)
+}
+
+func DumpTables(pattern string) ([]int, []int) {
+	finder := makeStringFinder(pattern)
+	return finder.badCharSkip[:], finder.goodSuffixSkip
+}
diff --git a/src/pkg/strings/replace.go b/src/pkg/strings/replace.go
index d863eeb5c..f63b1792c 100644
--- a/src/pkg/strings/replace.go
+++ b/src/pkg/strings/replace.go
@@ -33,6 +33,10 @@ func NewReplacer(oldnew ...string) *Replacer {
 		panic("strings.NewReplacer: odd argument count")
 	}
 
+	if len(oldnew) == 2 && len(oldnew[0]) > 1 {
+		return &Replacer{r: makeSingleStringReplacer(oldnew[0], oldnew[1])}
+	}
+
 	allNewBytes := true
 	for i := 0; i < len(oldnew); i += 2 {
 		if len(oldnew[i]) != 1 {
@@ -288,6 +292,10 @@ func (w *appendSliceWriter) WriteString(s string) (int, error) {
 	return len(s), nil
 }
 
+type stringWriterIface interface {
+	WriteString(string) (int, error)
+}
+
 type stringWriter struct {
 	w io.Writer
 }
@@ -296,6 +304,14 @@ func (w stringWriter) WriteString(s string) (int, error) {
 	return w.w.Write([]byte(s))
 }
 
+func getStringWriter(w io.Writer) stringWriterIface {
+	sw, ok := w.(stringWriterIface)
+	if !ok {
+		sw = stringWriter{w}
+	}
+	return sw
+}
+
 func (r *genericReplacer) Replace(s string) string {
 	buf := make(appendSliceWriter, 0, len(s))
 	r.WriteString(&buf, s)
@@ -303,13 +319,7 @@ func (r *genericReplacer) Replace(s string) string {
 }
 
 func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
-	sw, ok := w.(interface {
-		WriteString(string) (int, error)
-	})
-	if !ok {
-		sw = stringWriter{w}
-	}
-
+	sw := getStringWriter(w)
 	var last, wn int
 	var prevMatchEmpty bool
 	for i := 0; i <= len(s); {
@@ -340,6 +350,62 @@ func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error)
 	return
 }
 
+// singleStringReplacer is the implementation that's used when there is only
+// one string to replace (and that string has more than one byte).
+type singleStringReplacer struct {
+	finder *stringFinder
+	// value is the new string that replaces that pattern when it's found.
+	value string
+}
+
+func makeSingleStringReplacer(pattern string, value string) *singleStringReplacer {
+	return &singleStringReplacer{finder: makeStringFinder(pattern), value: value}
+}
+
+func (r *singleStringReplacer) Replace(s string) string {
+	var buf []byte
+	i := 0
+	for {
+		match := r.finder.next(s[i:])
+		if match == -1 {
+			break
+		}
+		buf = append(buf, s[i:i+match]...)
+		buf = append(buf, r.value...)
+		i += match + len(r.finder.pattern)
+	}
+	if buf == nil {
+		return s
+	}
+	buf = append(buf, s[i:]...)
+	return string(buf)
+}
+
+func (r *singleStringReplacer) WriteString(w io.Writer, s string) (n int, err error) {
+	sw := getStringWriter(w)
+	var i, wn int
+	for {
+		match := r.finder.next(s[i:])
+		if match == -1 {
+			break
+		}
+		wn, err = sw.WriteString(s[i : i+match])
+		n += wn
+		if err != nil {
+			return
+		}
+		wn, err = sw.WriteString(r.value)
+		n += wn
+		if err != nil {
+			return
+		}
+		i += match + len(r.finder.pattern)
+	}
+	wn, err = sw.WriteString(s[i:])
+	n += wn
+	return
+}
+
 // byteReplacer is the implementation that's used when all the "old"
 // and "new" values are single ASCII bytes.
 type byteReplacer struct {
diff --git a/src/pkg/strings/replace_test.go b/src/pkg/strings/replace_test.go
index edc990c18..d33dea95b 100644
--- a/src/pkg/strings/replace_test.go
+++ b/src/pkg/strings/replace_test.go
@@ -254,6 +254,17 @@ func TestReplacer(t *testing.T) {
 		testCase{blankFoo, "", "X"},
 	)
 
+	// single string replacer
+
+	abcMatcher := NewReplacer("abc", "[match]")
+
+	testCases = append(testCases,
+		testCase{abcMatcher, "", ""},
+		testCase{abcMatcher, "ab", "ab"},
+		testCase{abcMatcher, "abcd", "[match]d"},
+		testCase{abcMatcher, "cabcabcdabca", "c[match][match]d[match]a"},
+	)
+
 	// No-arg test cases.
 
 	nop := NewReplacer()
@@ -294,8 +305,9 @@ func TestPickAlgorithm(t *testing.T) {
 	}{
 		{capitalLetters, "*strings.byteReplacer"},
 		{htmlEscaper, "*strings.byteStringReplacer"},
-		{NewReplacer("12", "123"), "*strings.genericReplacer"},
+		{NewReplacer("12", "123"), "*strings.singleStringReplacer"},
 		{NewReplacer("1", "12"), "*strings.byteStringReplacer"},
+		{NewReplacer("", "X"), "*strings.genericReplacer"},
 		{NewReplacer("a", "1", "b", "12", "cde", "123"), "*strings.genericReplacer"},
 	}
 	for i, tc := range testCases {
@@ -401,6 +413,27 @@ func BenchmarkGenericMatch2(b *testing.B) {
 	}
 }
 
+func benchmarkSingleString(b *testing.B, pattern, text string) {
+	r := NewReplacer(pattern, "[match]")
+	b.SetBytes(int64(len(text)))
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		r.Replace(text)
+	}
+}
+
+func BenchmarkSingleMaxSkipping(b *testing.B) {
+	benchmarkSingleString(b, Repeat("b", 25), Repeat("a", 10000))
+}
+
+func BenchmarkSingleLongSuffixFail(b *testing.B) {
+	benchmarkSingleString(b, "b"+Repeat("a", 500), Repeat("a", 1002))
+}
+
+func BenchmarkSingleMatch(b *testing.B) {
+	benchmarkSingleString(b, "abcdef", Repeat("abcdefghijklmno", 1000))
+}
+
 func BenchmarkByteByteNoMatch(b *testing.B) {
 	str := Repeat("A", 100) + Repeat("B", 100)
 	for i := 0; i < b.N; i++ {
diff --git a/src/pkg/strings/search.go b/src/pkg/strings/search.go
new file mode 100644
index 000000000..f77c879c5
--- /dev/null
+++ b/src/pkg/strings/search.go
@@ -0,0 +1,124 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package strings
+
+// stringFinder efficiently finds strings in a source text. It's implemented
+// using the Boyer-Moore string search algorithm:
+// http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
+// http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged
+// document uses 1-based indexing)
+type stringFinder struct {
+	// pattern is the string that we are searching for in the text.
+	pattern string
+
+	// badCharSkip[b] contains the distance between the last byte of pattern
+	// and the rightmost occurrence of b in pattern. If b is not in pattern,
+	// badCharSkip[b] is len(pattern).
+	//
+	// Whenever a mismatch is found with byte b in the text, we can safely
+	// shift the matching frame at least badCharSkip[b] until the next time
+	// the matching char could be in alignment.
+	badCharSkip [256]int
+
+	// goodSuffixSkip[i] defines how far we can shift the matching frame given
+	// that the suffix pattern[i+1:] matches, but the byte pattern[i] does
+	// not. There are two cases to consider:
+	//
+	// 1. The matched suffix occurs elsewhere in pattern (with a different
+	// byte preceding it that we might possibly match). In this case, we can
+	// shift the matching frame to align with the next suffix chunk. For
+	// example, the pattern "mississi" has the suffix "issi" next occurring
+	// (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
+	// shift+len(suffix) == 3+4 == 7.
+	//
+	// 2. If the matched suffix does not occur elsewhere in pattern, then the
+	// matching frame may share part of its prefix with the end of the
+	// matching suffix. In this case, goodSuffixSkip[i] will contain how far
+	// to shift the frame to align this portion of the prefix to the
+	// suffix. For example, in the pattern "abcxxxabc", when the first
+	// mismatch from the back is found to be in position 3, the matching
+	// suffix "xxabc" is not found elsewhere in the pattern. However, its
+	// rightmost "abc" (at position 6) is a prefix of the whole pattern, so
+	// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
+	goodSuffixSkip []int
+}
+
+func makeStringFinder(pattern string) *stringFinder {
+	f := &stringFinder{
+		pattern:        pattern,
+		goodSuffixSkip: make([]int, len(pattern)),
+	}
+	// last is the index of the last character in the pattern.
+	last := len(pattern) - 1
+
+	// Build bad character table.
+	// Bytes not in the pattern can skip one pattern's length.
+	for i := range f.badCharSkip {
+		f.badCharSkip[i] = len(pattern)
+	}
+	// The loop condition is < instead of <= so that the last byte does not
+	// have a zero distance to itself. Finding this byte out of place implies
+	// that it is not in the last position.
+	for i := 0; i < last; i++ {
+		f.badCharSkip[pattern[i]] = last - i
+	}
+
+	// Build good suffix table.
+	// First pass: set each value to the next index which starts a prefix of
+	// pattern.
+	lastPrefix := last
+	for i := last; i >= 0; i-- {
+		if HasPrefix(pattern, pattern[i+1:]) {
+			lastPrefix = i + 1
+		}
+		// lastPrefix is the shift, and (last-i) is len(suffix).
+		f.goodSuffixSkip[i] = lastPrefix + last - i
+	}
+	// Second pass: find repeats of pattern's suffix starting from the front.
+	for i := 0; i < last; i++ {
+		lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1])
+		if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
+			// (last-i) is the shift, and lenSuffix is len(suffix).
+			f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
+		}
+	}
+
+	return f
+}
+
+func longestCommonSuffix(a, b string) (i int) {
+	for ; i < len(a) && i < len(b); i++ {
+		if a[len(a)-1-i] != b[len(b)-1-i] {
+			break
+		}
+	}
+	return
+}
+
+// next returns the index in text of the first occurrence of the pattern. If
+// the pattern is not found, it returns -1.
+func (f *stringFinder) next(text string) int {
+	i := len(f.pattern) - 1
+	for i < len(text) {
+		// Compare backwards from the end until the first unmatching character.
+		j := len(f.pattern) - 1
+		for j >= 0 && text[i] == f.pattern[j] {
+			i--
+			j--
+		}
+		if j < 0 {
+			return i + 1 // match
+		}
+		i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
+	}
+	return -1
+}
+
+func max(a, b int) int {
+	if a > b {
+		return a
+	}
+	return b
+}
diff --git a/src/pkg/strings/search_test.go b/src/pkg/strings/search_test.go
new file mode 100644
index 000000000..966c05e65
--- /dev/null
+++ b/src/pkg/strings/search_test.go
@@ -0,0 +1,90 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package strings_test
+
+import (
+	"reflect"
+	. "strings"
+	"testing"
+)
+
+func TestFinderNext(t *testing.T) {
+	testCases := []struct {
+		pat, text string
+		index     int
+	}{
+		{"", "", 0},
+		{"", "abc", 0},
+		{"abc", "", -1},
+		{"abc", "abc", 0},
+		{"d", "abcdefg", 3},
+		{"nan", "banana", 2},
+		{"pan", "anpanman", 2},
+		{"nnaaman", "anpanmanam", -1},
+		{"abcd", "abc", -1},
+		{"abcd", "bcd", -1},
+		{"bcd", "abcd", 1},
+		{"abc", "acca", -1},
+		{"aa", "aaa", 0},
+		{"baa", "aaaaa", -1},
+		{"at that", "which finally halts.  at that point", 22},
+	}
+
+	for _, tc := range testCases {
+		got := StringFind(tc.pat, tc.text)
+		want := tc.index
+		if got != want {
+			t.Errorf("stringFind(%q, %q) got %d, want %d\n", tc.pat, tc.text, got, want)
+		}
+	}
+}
+
+func TestFinderCreation(t *testing.T) {
+	testCases := []struct {
+		pattern string
+		bad     [256]int
+		suf     []int
+	}{
+		{
+			"abc",
+			[256]int{'a': 2, 'b': 1, 'c': 3},
+			[]int{5, 4, 1},
+		},
+		{
+			"mississi",
+			[256]int{'i': 3, 'm': 7, 's': 1},
+			[]int{15, 14, 13, 7, 11, 10, 7, 1},
+		},
+		// From http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf
+		{
+			"abcxxxabc",
+			[256]int{'a': 2, 'b': 1, 'c': 6, 'x': 3},
+			[]int{14, 13, 12, 11, 10, 9, 11, 10, 1},
+		},
+		{
+			"abyxcdeyx",
+			[256]int{'a': 8, 'b': 7, 'c': 4, 'd': 3, 'e': 2, 'y': 1, 'x': 5},
+			[]int{17, 16, 15, 14, 13, 12, 7, 10, 1},
+		},
+	}
+
+	for _, tc := range testCases {
+		bad, good := DumpTables(tc.pattern)
+
+		for i, got := range bad {
+			want := tc.bad[i]
+			if want == 0 {
+				want = len(tc.pattern)
+			}
+			if got != want {
+				t.Errorf("boyerMoore(%q) bad['%c']: got %d want %d", tc.pattern, i, got, want)
+			}
+		}
+
+		if !reflect.DeepEqual(good, tc.suf) {
+			t.Errorf("boyerMoore(%q) got %v want %v", tc.pattern, good, tc.suf)
+		}
+	}
+}
author	Eric Roshan-Eisner <eric.d.eisner@gmail.com>	2012-09-28 12:34:18 +1000
committer	Eric Roshan-Eisner <eric.d.eisner@gmail.com>	2012-09-28 12:34:18 +1000
commit	02d709e3d3d73e7ccce1c2df482dc76e2aae2f4b (patch)
tree	35cf0078c065f3566283006533c620bdc1580c89 /src/pkg/strings
parent	5fc3807c1e3d76dae95393d13fe08d22f6ff92e2 (diff)
download	go-02d709e3d3d73e7ccce1c2df482dc76e2aae2f4b.tar.gz