1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
|
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"bytes"
"os"
"testing"
)
// tokenTest is one tokenizer test case: an HTML input together with the
// string form of every token the tokenizer is expected to produce from it.
// The field order matters because the test table uses positional literals.
type tokenTest struct {
	// desc labels the test case in failure messages.
	desc string
	// html is the input handed to the tokenizer.
	html string
	// tokens lists, in order, the expected string form of each token.
	tokens []string
}
var tokenTests = []tokenTest{
// A single text node. The tokenizer should not break text nodes on whitespace,
// nor should it normalize whitespace within a text node.
{
"text",
"foo bar",
[]string{
"foo bar",
},
},
// An entity.
{
"entity",
"one < two",
[]string{
"one < two",
},
},
// A start, self-closing and end tag. The tokenizer does not care if the start
// and end tokens don't match; that is the job of the parser.
{
"tags",
"<a>b<c/>d</e>",
[]string{
"<a>",
"b",
"<c/>",
"d",
"</e>",
},
},
// An attribute with a backslash.
{
"backslash",
`<p id="a\"b">`,
[]string{
`<p id="a"b">`,
},
},
// Entities, tag name and attribute key lower-casing, and whitespace
// normalization within a tag.
{
"tricky",
"<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>",
[]string{
`<p id="a"B" foo="bar">`,
"<em>",
"te<&;xt",
"</em>",
"</p>",
},
},
// A non-existant entity. Tokenizing and converting back to a string should
// escape the "&" to become "&".
{
"noSuchEntity",
`<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`,
[]string{
`<a b="c&noSuchEntity;d">`,
"<&alsoDoesntExist;&",
},
},
}
// TestTokenizer feeds the html of every tokenTests entry to a fresh tokenizer
// and compares the string form of each token against the expected list. The
// first mismatch or tokenizer error abandons that case; a case whose tokens
// all match must then report os.EOF on the following call to Next.
func TestTokenizer(t *testing.T) {
nextCase:
	for _, tc := range tokenTests {
		z := NewTokenizer(bytes.NewBuffer([]byte(tc.html)))
		for idx := 0; idx < len(tc.tokens); idx++ {
			want := tc.tokens[idx]
			if kind := z.Next(); kind == Error {
				t.Errorf("%s token %d: want %q got error %v", tc.desc, idx, want, z.Error())
				continue nextCase
			}
			if got := z.Token().String(); got != want {
				t.Errorf("%s token %d: want %q got %q", tc.desc, idx, want, got)
				continue nextCase
			}
		}
		// All expected tokens were consumed; one more Next must hit end of input.
		z.Next()
		if z.Error() == os.EOF {
			continue
		}
		t.Errorf("%s: want EOF got %q", tc.desc, z.Token().String())
	}
}
// TestUnescapeEscape checks that UnescapeString(EscapeString(s)) returns s
// unchanged. The inputs cover the empty string, plain text, raw special
// characters, and strings that already contain entity-like text (with and
// without the trailing semicolon), which must survive the round trip verbatim.
func TestUnescapeEscape(t *testing.T) {
	ss := []string{
		``,
		`abc def`,
		`a & b`,
		`a&amp;b`,
		`a &amp b`,
		`&quot;`,
		`"`,
		`"<&>"`,
		`&quot;&lt;&amp;&gt;&quot;`,
		`3&5==1 && 0<1, "0&lt;1", a+acute=&aacute;`,
	}
	for _, s := range ss {
		if s != UnescapeString(EscapeString(s)) {
			t.Errorf("s != UnescapeString(EscapeString(s)), s=%q", s)
		}
	}
}
// TestBufAPI walks a fixed input using the tokenizer's Text and TagName
// accessors and gathers every text token seen while the running count of
// <a> start tags minus </a> end tags is positive. The gathered text must
// equal "14567".
func TestBufAPI(t *testing.T) {
	const input = "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
	z := NewTokenizer(bytes.NewBuffer([]byte(input)))
	var collected bytes.Buffer
	// openAnchors is the number of "a" start tags seen minus "a" end tags seen.
	openAnchors := 0
	for {
		kind := z.Next()
		if kind == Error {
			// os.EOF is the expected way for the loop to end; anything else is a failure.
			if z.Error() != os.EOF {
				t.Error(z.Error())
			}
			break
		}
		if kind == Text {
			if openAnchors > 0 {
				collected.Write(z.Text())
			}
			continue
		}
		if kind != StartTag && kind != EndTag {
			continue
		}
		// The second result of TagName is not needed here.
		name, _ := z.TagName()
		if string(name) != "a" {
			continue
		}
		if kind == StartTag {
			openAnchors++
		} else {
			openAnchors--
		}
	}
	const want = "14567"
	if got := collected.String(); got != want {
		t.Errorf("TestBufAPI: want %q got %q", want, got)
	}
}
|