Diffstat (limited to 'src/pkg/regexp/exec_test.go')
-rw-r--r--  src/pkg/regexp/exec_test.go  715
1 file changed, 0 insertions, 715 deletions
diff --git a/src/pkg/regexp/exec_test.go b/src/pkg/regexp/exec_test.go
deleted file mode 100644
index 70d069c06..000000000
--- a/src/pkg/regexp/exec_test.go
+++ /dev/null
@@ -1,715 +0,0 @@
-// Copyright 2010 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package regexp
-
-import (
- "bufio"
- "compress/bzip2"
- "fmt"
- "io"
- "os"
- "path/filepath"
- "regexp/syntax"
- "strconv"
- "strings"
- "testing"
- "unicode/utf8"
-)
-
-// TestRE2 tests this package's regexp API against test cases
-// considered during RE2's exhaustive tests, which run all possible
-// regexps over a given set of atoms and operators, up to a given
-// complexity, over all possible strings over a given alphabet,
-// up to a given size. Rather than try to link with RE2, we read a
-// log file containing the test cases and the expected matches.
-// The log file, re2.txt, is generated by running 'make exhaustive-log'
-// in the open source RE2 distribution. http://code.google.com/p/re2/
-//
-// The test file format is a sequence of stanzas like:
-//
-// strings
-// "abc"
-// "123x"
-// regexps
-// "[a-z]+"
-// 0-3;0-3
-// -;-
-// "([0-9])([0-9])([0-9])"
-// -;-
-// -;0-3 0-1 1-2 2-3
-//
-// The stanza begins by defining a set of strings, quoted
-// using Go double-quote syntax, one per line. Then the
-// regexps section gives a sequence of regexps to run on
-// the strings. In the block that follows a regexp, each line
-// gives the semicolon-separated match results of running
-// the regexp on the corresponding string.
-// Each match result is either a single -, meaning no match, or a
-// space-separated sequence of pairs giving the match and
-// submatch indices. An unmatched subexpression formats
-// its pair as a single - (not illustrated above). Each regexp run
-// produces four match results: a ``full match'' that restricts the
-// regexp to matching the entire string or nothing and a ``partial
-// match'' that gives the leftmost first match found in the string,
-// each in both leftmost-first and leftmost-longest mode (the example
-// above shows only the first two).
-//
-// Lines beginning with # are comments. Lines beginning with
-// a capital letter are test names printed during RE2's test suite
-// and are echoed into t but otherwise ignored.
-//
-// At time of writing, re2.txt is 32 MB but compresses to 760 kB,
-// so we store a bzip2-compressed copy in the repository and decompress
-// it on the fly.
-//
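-// As an illustration of how a stanza drives the test (using the example
-// above): for the regexp "[a-z]+" and the string "abc", a result field
-// of "0-3" is parsed by parseResult into the index pair []int{0, 3} and
-// compared against re.FindStringSubmatchIndex("abc"), while a field of
-// just "-" parses to a nil slice, meaning no match is expected.
-//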
-func TestRE2Search(t *testing.T) {
- testRE2(t, "testdata/re2-search.txt")
-}
-
-func testRE2(t *testing.T, file string) {
- f, err := os.Open(file)
- if err != nil {
- t.Fatal(err)
- }
- defer f.Close()
- var txt io.Reader
- if strings.HasSuffix(file, ".bz2") {
- z := bzip2.NewReader(f)
- txt = z
- file = file[:len(file)-len(".bz2")] // for error messages
- } else {
- txt = f
- }
- lineno := 0
- scanner := bufio.NewScanner(txt)
- var (
- str []string
- input []string
- inStrings bool
- re *Regexp
- refull *Regexp
- nfail int
- ncase int
- )
- for lineno := 1; scanner.Scan(); lineno++ {
- line := scanner.Text()
- switch {
- case line == "":
- t.Fatalf("%s:%d: unexpected blank line", file, lineno)
- case line[0] == '#':
- continue
- case 'A' <= line[0] && line[0] <= 'Z':
- // Test name.
- t.Logf("%s\n", line)
- continue
- case line == "strings":
- str = str[:0]
- inStrings = true
- case line == "regexps":
- inStrings = false
- case line[0] == '"':
- q, err := strconv.Unquote(line)
- if err != nil {
- // Fatal because we'll get out of sync.
- t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err)
- }
- if inStrings {
- str = append(str, q)
- continue
- }
- // Is a regexp.
- if len(input) != 0 {
- t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q)
- }
- re, err = tryCompile(q)
- if err != nil {
- if err.Error() == "error parsing regexp: invalid escape sequence: `\\C`" {
- // We don't and likely never will support \C; keep going.
- continue
- }
- t.Errorf("%s:%d: compile %#q: %v", file, lineno, q, err)
- if nfail++; nfail >= 100 {
- t.Fatalf("stopping after %d errors", nfail)
- }
- continue
- }
- full := `\A(?:` + q + `)\z`
- refull, err = tryCompile(full)
- if err != nil {
- // Fatal because q worked, so this should always work.
- t.Fatalf("%s:%d: compile full %#q: %v", file, lineno, full, err)
- }
- input = str
- case line[0] == '-' || '0' <= line[0] && line[0] <= '9':
- // A sequence of match results.
- ncase++
- if re == nil {
- // Failed to compile: skip results.
- continue
- }
- if len(input) == 0 {
- t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno)
- }
- var text string
- text, input = input[0], input[1:]
- if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) {
- // RE2's \B considers every byte position,
- // so it sees 'not word boundary' in the
- // middle of UTF-8 sequences. This package
- // only considers the positions between runes,
- // so it disagrees. Skip those cases.
- continue
- }
- res := strings.Split(line, ";")
- if len(res) != len(run) {
- t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run))
- }
- for i := range res {
- have, suffix := run[i](re, refull, text)
- want := parseResult(t, file, lineno, res[i])
- if !same(have, want) {
- t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, re, suffix, text, have, want)
- if nfail++; nfail >= 100 {
- t.Fatalf("stopping after %d errors", nfail)
- }
- continue
- }
- b, suffix := match[i](re, refull, text)
- if b != (want != nil) {
- t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, re, suffix, text, b, !b)
- if nfail++; nfail >= 100 {
- t.Fatalf("stopping after %d errors", nfail)
- }
- continue
- }
- }
-
- default:
- t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line)
- }
- }
- if err := scanner.Err(); err != nil {
- t.Fatalf("%s:%d: %v", file, lineno, err)
- }
- if len(input) != 0 {
- t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input))
- }
- t.Logf("%d cases tested", ncase)
-}
-
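-// run lists the four ways each regexp is executed against a string
-// (full and partial match, each in leftmost-first and leftmost-longest
-// mode), in the order the results appear on a result line; the match
-// list below holds the corresponding boolean MatchString checks.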
-var run = []func(*Regexp, *Regexp, string) ([]int, string){
- runFull,
- runPartial,
- runFullLongest,
- runPartialLongest,
-}
-
-func runFull(re, refull *Regexp, text string) ([]int, string) {
- refull.longest = false
- return refull.FindStringSubmatchIndex(text), "[full]"
-}
-
-func runPartial(re, refull *Regexp, text string) ([]int, string) {
- re.longest = false
- return re.FindStringSubmatchIndex(text), ""
-}
-
-func runFullLongest(re, refull *Regexp, text string) ([]int, string) {
- refull.longest = true
- return refull.FindStringSubmatchIndex(text), "[full,longest]"
-}
-
-func runPartialLongest(re, refull *Regexp, text string) ([]int, string) {
- re.longest = true
- return re.FindStringSubmatchIndex(text), "[longest]"
-}
-
-var match = []func(*Regexp, *Regexp, string) (bool, string){
- matchFull,
- matchPartial,
- matchFullLongest,
- matchPartialLongest,
-}
-
-func matchFull(re, refull *Regexp, text string) (bool, string) {
- refull.longest = false
- return refull.MatchString(text), "[full]"
-}
-
-func matchPartial(re, refull *Regexp, text string) (bool, string) {
- re.longest = false
- return re.MatchString(text), ""
-}
-
-func matchFullLongest(re, refull *Regexp, text string) (bool, string) {
- refull.longest = true
- return refull.MatchString(text), "[full,longest]"
-}
-
-func matchPartialLongest(re, refull *Regexp, text string) (bool, string) {
- re.longest = true
- return re.MatchString(text), "[longest]"
-}
-
-func isSingleBytes(s string) bool {
- for _, c := range s {
- if c >= utf8.RuneSelf {
- return false
- }
- }
- return true
-}
-
-func tryCompile(s string) (re *Regexp, err error) {
- // Protect against panic during Compile.
- defer func() {
- if r := recover(); r != nil {
- err = fmt.Errorf("panic: %v", r)
- }
- }()
- return Compile(s)
-}
-
-func parseResult(t *testing.T, file string, lineno int, res string) []int {
- // A single - indicates no match.
- if res == "-" {
- return nil
- }
- // Otherwise, a space-separated list of pairs.
- n := 1
- for j := 0; j < len(res); j++ {
- if res[j] == ' ' {
- n++
- }
- }
- out := make([]int, 2*n)
- i := 0
- n = 0
- for j := 0; j <= len(res); j++ {
- if j == len(res) || res[j] == ' ' {
- // Process a single pair. - means no submatch.
- pair := res[i:j]
- if pair == "-" {
- out[n] = -1
- out[n+1] = -1
- } else {
- k := strings.Index(pair, "-")
- if k < 0 {
- t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
- }
- lo, err1 := strconv.Atoi(pair[:k])
- hi, err2 := strconv.Atoi(pair[k+1:])
- if err1 != nil || err2 != nil || lo > hi {
- t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
- }
- out[n] = lo
- out[n+1] = hi
- }
- n += 2
- i = j + 1
- }
- }
- return out
-}
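-
-// Illustrative parseResult examples: "0-3 0-1 1-2 2-3" parses to
-// []int{0, 3, 0, 1, 1, 2, 2, 3}; an unmatched subexpression written as
-// a lone "-" within the list contributes the pair -1, -1; and a result
-// that is just "-" returns nil, meaning no match is expected at all.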
-
-func same(x, y []int) bool {
- if len(x) != len(y) {
- return false
- }
- for i, xi := range x {
- if xi != y[i] {
- return false
- }
- }
- return true
-}
-
-// TestFowler runs this package's regexp API against the
-// POSIX regular expression tests collected by Glenn Fowler
-// at http://www2.research.att.com/~gsf/testregex/.
-func TestFowler(t *testing.T) {
- files, err := filepath.Glob("testdata/*.dat")
- if err != nil {
- t.Fatal(err)
- }
- for _, file := range files {
- t.Log(file)
- testFowler(t, file)
- }
-}
-
-var notab = MustCompilePOSIX(`[^\t]+`)
-
-func testFowler(t *testing.T, file string) {
- f, err := os.Open(file)
- if err != nil {
- t.Error(err)
- return
- }
- defer f.Close()
- b := bufio.NewReader(f)
- lineno := 0
- lastRegexp := ""
-Reading:
- for {
- lineno++
- line, err := b.ReadString('\n')
- if err != nil {
- if err != io.EOF {
- t.Errorf("%s:%d: %v", file, lineno, err)
- }
- break Reading
- }
-
- // http://www2.research.att.com/~gsf/man/man1/testregex.html
- //
- // INPUT FORMAT
- // Input lines may be blank, a comment beginning with #, or a test
- // specification. A specification is five fields separated by one
- // or more tabs. NULL denotes the empty string and NIL denotes the
- // 0 pointer.
- if line[0] == '#' || line[0] == '\n' {
- continue Reading
- }
- line = line[:len(line)-1]
- field := notab.FindAllString(line, -1)
- for i, f := range field {
- if f == "NULL" {
- field[i] = ""
- }
- if f == "NIL" {
- t.Logf("%s:%d: skip: %s", file, lineno, line)
- continue Reading
- }
- }
- if len(field) == 0 {
- continue Reading
- }
-
- // Field 1: the regex(3) flags to apply, one character per REG_feature
- // flag. The test is skipped if REG_feature is not supported by the
- // implementation. If the first character is not [BEASKLP] then the
- // specification is a global control line. One or more of [BEASKLP] may be
- // specified; the test will be repeated for each mode.
- //
- // B basic BRE (grep, ed, sed)
- // E REG_EXTENDED ERE (egrep)
- // A REG_AUGMENTED ARE (egrep with negation)
- // S REG_SHELL SRE (sh glob)
- // K REG_SHELL|REG_AUGMENTED KRE (ksh glob)
- // L REG_LITERAL LRE (fgrep)
- //
- // a REG_LEFT|REG_RIGHT implicit ^...$
- // b REG_NOTBOL lhs does not match ^
- // c REG_COMMENT ignore space and #...\n
- // d REG_SHELL_DOT explicit leading . match
- // e REG_NOTEOL rhs does not match $
- // f REG_MULTIPLE multiple \n separated patterns
- // g FNM_LEADING_DIR testfnmatch only -- match until /
- // h REG_MULTIREF multiple digit backref
- // i REG_ICASE ignore case
- // j REG_SPAN . matches \n
- // k REG_ESCAPE \ to escape [...] delimiter
- // l REG_LEFT implicit ^...
- // m REG_MINIMAL minimal match
- // n REG_NEWLINE explicit \n match
- // o REG_ENCLOSED (|&) magic inside [@|&](...)
- // p REG_SHELL_PATH explicit / match
- // q REG_DELIMITED delimited pattern
- // r REG_RIGHT implicit ...$
- // s REG_SHELL_ESCAPED \ not special
- // t REG_MUSTDELIM all delimiters must be specified
- // u standard unspecified behavior -- errors not counted
- // v REG_CLASS_ESCAPE \ special inside [...]
- // w REG_NOSUB no subexpression match array
- // x REG_LENIENT let some errors slide
- // y REG_LEFT regexec() implicit ^...
- // z REG_NULL NULL subexpressions ok
- // $ expand C \c escapes in fields 2 and 3
- // / field 2 is a regsubcomp() expression
- // = field 3 is a regdecomp() expression
- //
- // Field 1 control lines:
- //
- // C set LC_COLLATE and LC_CTYPE to locale in field 2
- //
- // ?test ... output field 5 if passed and != EXPECTED, silent otherwise
- // &test ... output field 5 if current and previous passed
- // |test ... output field 5 if current passed and previous failed
- // ; ... output field 2 if previous failed
- // {test ... skip if failed until }
- // } end of skip
- //
- // : comment comment copied as output NOTE
- // :comment:test :comment: ignored
- // N[OTE] comment comment copied as output NOTE
- // T[EST] comment comment
- //
- // number use number for nmatch (20 by default)
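- //
- // As a hypothetical example (not taken from the testdata files), a
- // specification line with the four required fields might look like
- //
- // E a([bc])d abd (0,3)(1,2)
- //
- // meaning: compile "a([bc])d" as an ERE, match it against "abd", and
- // expect the whole match at [0,3) with submatch 1 at [1,2). A fifth
- // field, if present, is a comment appended to the report.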
- flag := field[0]
- switch flag[0] {
- case '?', '&', '|', ';', '{', '}':
- // Ignore all the control operators.
- // Just run everything.
- flag = flag[1:]
- if flag == "" {
- continue Reading
- }
- case ':':
- i := strings.Index(flag[1:], ":")
- if i < 0 {
- t.Logf("skip: %s", line)
- continue Reading
- }
- flag = flag[1+i+1:]
- case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
- t.Logf("skip: %s", line)
- continue Reading
- }
-
- // Can check field count now that we've handled the myriad comment formats.
- if len(field) < 4 {
- t.Errorf("%s:%d: too few fields: %s", file, lineno, line)
- continue Reading
- }
-
- // Expand C escapes (a.k.a. Go escapes).
- if strings.Contains(flag, "$") {
- f := `"` + field[1] + `"`
- if field[1], err = strconv.Unquote(f); err != nil {
- t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
- }
- f = `"` + field[2] + `"`
- if field[2], err = strconv.Unquote(f); err != nil {
- t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
- }
- }
-
- // Field 2: the regular expression pattern; SAME uses the pattern from
- // the previous specification.
- //
- if field[1] == "SAME" {
- field[1] = lastRegexp
- }
- lastRegexp = field[1]
-
- // Field 3: the string to match.
- text := field[2]
-
- // Field 4: the test outcome...
- ok, shouldCompile, shouldMatch, pos := parseFowlerResult(field[3])
- if !ok {
- t.Errorf("%s:%d: cannot parse result %#q", file, lineno, field[3])
- continue Reading
- }
-
- // Field 5: optional comment appended to the report.
-
- Testing:
- // Run test once for each specified capital letter mode that we support.
- for _, c := range flag {
- pattern := field[1]
- syn := syntax.POSIX | syntax.ClassNL
- switch c {
- default:
- continue Testing
- case 'E':
- // extended regexp (what we support)
- case 'L':
- // literal
- pattern = QuoteMeta(pattern)
- }
-
- for _, c := range flag {
- switch c {
- case 'i':
- syn |= syntax.FoldCase
- }
- }
-
- re, err := compile(pattern, syn, true)
- if err != nil {
- if shouldCompile {
- t.Errorf("%s:%d: %#q did not compile", file, lineno, pattern)
- }
- continue Testing
- }
- if !shouldCompile {
- t.Errorf("%s:%d: %#q should not compile", file, lineno, pattern)
- continue Testing
- }
- match := re.MatchString(text)
- if match != shouldMatch {
- t.Errorf("%s:%d: %#q.Match(%#q) = %v, want %v", file, lineno, pattern, text, match, shouldMatch)
- continue Testing
- }
- have := re.FindStringSubmatchIndex(text)
- if (len(have) > 0) != match {
- t.Errorf("%s:%d: %#q.Match(%#q) = %v, but %#q.FindSubmatchIndex(%#q) = %v", file, lineno, pattern, text, match, pattern, text, have)
- continue Testing
- }
- if len(have) > len(pos) {
- have = have[:len(pos)]
- }
- if !same(have, pos) {
- t.Errorf("%s:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, pattern, text, have, pos)
- }
- }
- }
-}
-
-func parseFowlerResult(s string) (ok, compiled, matched bool, pos []int) {
- // Field 4: the test outcome. This is either one of the posix error
- // codes (with REG_ omitted) or the match array, a list of (m,n)
- // entries with m and n being first and last+1 positions in the
- // field 3 string, or NULL if REG_NOSUB is in effect and success
- // is expected. BADPAT is acceptable in place of any regcomp(3)
- // error code. The match[] array is initialized to (-2,-2) before
- // each test. All array elements from 0 to nmatch-1 must be specified
- // in the outcome. Unspecified endpoints (offset -1) are denoted by ?.
- // Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a
- // matched (?{...}) expression, where x is the text enclosed by {...},
- // o is the expression ordinal counting from 1, and n is the length of
- // the unmatched portion of the subject string. If x starts with a
- // number then that is the return value of re_execf(), otherwise 0 is
- // returned.
- switch {
- case s == "":
- // Match with no position information.
- ok = true
- compiled = true
- matched = true
- return
- case s == "NOMATCH":
- // Match failure.
- ok = true
- compiled = true
- matched = false
- return
- case 'A' <= s[0] && s[0] <= 'Z':
- // All the other error codes are compile errors.
- ok = true
- compiled = false
- return
- }
- compiled = true
-
- var x []int
- for s != "" {
- var end byte = ')'
- if len(x)%2 == 0 {
- if s[0] != '(' {
- ok = false
- return
- }
- s = s[1:]
- end = ','
- }
- i := 0
- for i < len(s) && s[i] != end {
- i++
- }
- if i == 0 || i == len(s) {
- ok = false
- return
- }
- var v = -1
- var err error
- if s[:i] != "?" {
- v, err = strconv.Atoi(s[:i])
- if err != nil {
- ok = false
- return
- }
- }
- x = append(x, v)
- s = s[i+1:]
- }
- if len(x)%2 != 0 {
- ok = false
- return
- }
- ok = true
- matched = true
- pos = x
- return
-}
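-
-// Illustrative parseFowlerResult examples: "(0,3)(1,2)" yields
-// pos []int{0, 3, 1, 2} with compiled and matched both true; "NOMATCH"
-// yields compiled true and matched false; any other outcome starting
-// with a capital letter (a POSIX error code such as BADPAT) yields
-// compiled false; and a "?" endpoint yields -1 for that position.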
-
-var text []byte
-
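-// makeText returns n bytes of deterministic pseudo-random printable
-// ASCII with roughly one newline every 31 bytes, caching the generated
-// text in the package-level text slice so later calls can reuse it.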
-func makeText(n int) []byte {
- if len(text) >= n {
- return text[:n]
- }
- text = make([]byte, n)
- x := ^uint32(0)
- for i := range text {
- x += x
- x ^= 1
- if int32(x) < 0 {
- x ^= 0x88888eef
- }
- if x%31 == 0 {
- text[i] = '\n'
- } else {
- text[i] = byte(x%(0x7E+1-0x20) + 0x20)
- }
- }
- return text
-}
-
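-// benchmark compiles re, sets the per-iteration byte count for
-// throughput reporting, and runs Match against n bytes of generated
-// text; the patterns below are intended never to match, so each
-// iteration scans the whole input, and an unexpected match aborts
-// the benchmark.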
-func benchmark(b *testing.B, re string, n int) {
- r := MustCompile(re)
- t := makeText(n)
- b.ResetTimer()
- b.SetBytes(int64(n))
- for i := 0; i < b.N; i++ {
- if r.Match(t) {
- b.Fatal("match!")
- }
- }
-}
-
-const (
- easy0 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
- easy1 = "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"
- medium = "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
- hard = "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
- parens = "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" +
- "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$"
-)
-
-func BenchmarkMatchEasy0_32(b *testing.B) { benchmark(b, easy0, 32<<0) }
-func BenchmarkMatchEasy0_1K(b *testing.B) { benchmark(b, easy0, 1<<10) }
-func BenchmarkMatchEasy0_32K(b *testing.B) { benchmark(b, easy0, 32<<10) }
-func BenchmarkMatchEasy0_1M(b *testing.B) { benchmark(b, easy0, 1<<20) }
-func BenchmarkMatchEasy0_32M(b *testing.B) { benchmark(b, easy0, 32<<20) }
-func BenchmarkMatchEasy1_32(b *testing.B) { benchmark(b, easy1, 32<<0) }
-func BenchmarkMatchEasy1_1K(b *testing.B) { benchmark(b, easy1, 1<<10) }
-func BenchmarkMatchEasy1_32K(b *testing.B) { benchmark(b, easy1, 32<<10) }
-func BenchmarkMatchEasy1_1M(b *testing.B) { benchmark(b, easy1, 1<<20) }
-func BenchmarkMatchEasy1_32M(b *testing.B) { benchmark(b, easy1, 32<<20) }
-func BenchmarkMatchMedium_32(b *testing.B) { benchmark(b, medium, 32<<0) }
-func BenchmarkMatchMedium_1K(b *testing.B) { benchmark(b, medium, 1<<10) }
-func BenchmarkMatchMedium_32K(b *testing.B) { benchmark(b, medium, 32<<10) }
-func BenchmarkMatchMedium_1M(b *testing.B) { benchmark(b, medium, 1<<20) }
-func BenchmarkMatchMedium_32M(b *testing.B) { benchmark(b, medium, 32<<20) }
-func BenchmarkMatchHard_32(b *testing.B) { benchmark(b, hard, 32<<0) }
-func BenchmarkMatchHard_1K(b *testing.B) { benchmark(b, hard, 1<<10) }
-func BenchmarkMatchHard_32K(b *testing.B) { benchmark(b, hard, 32<<10) }
-func BenchmarkMatchHard_1M(b *testing.B) { benchmark(b, hard, 1<<20) }
-func BenchmarkMatchHard_32M(b *testing.B) { benchmark(b, hard, 32<<20) }
-
-func TestLongest(t *testing.T) {
- re, err := Compile(`a(|b)`)
- if err != nil {
- t.Fatal(err)
- }
- if g, w := re.FindString("ab"), "a"; g != w {
- t.Errorf("first match was %q, want %q", g, w)
- }
- re.Longest()
- if g, w := re.FindString("ab"), "ab"; g != w {
- t.Errorf("longest match was %q, want %q", g, w)
- }
-}