Diffstat (limited to 'libgo/go/exp/html/token.go')
-rw-r--r--    libgo/go/exp/html/token.go    534
1 file changed, 464 insertions, 70 deletions
diff --git a/libgo/go/exp/html/token.go b/libgo/go/exp/html/token.go
index b5e9c2d6ea3..517bd5d3ee8 100644
--- a/libgo/go/exp/html/token.go
+++ b/libgo/go/exp/html/token.go
@@ -6,13 +6,14 @@ package html
 
 import (
     "bytes"
+    "exp/html/atom"
     "io"
     "strconv"
     "strings"
 )
 
 // A TokenType is the type of a Token.
-type TokenType int
+type TokenType uint32
 
 const (
     // ErrorToken means that an error occurred during tokenization.
@@ -65,11 +66,13 @@ type Attribute struct {
 // A Token consists of a TokenType and some Data (tag name for start and end
 // tags, content for text, comments and doctypes). A tag Token may also contain
 // a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
-// rather than "a&lt;b").
+// rather than "a&lt;b"). For tag Tokens, DataAtom is the atom for Data, or
+// zero if Data is not a known tag name.
 type Token struct {
-    Type TokenType
-    Data string
-    Attr []Attribute
+    Type     TokenType
+    DataAtom atom.Atom
+    Data     string
+    Attr     []Attribute
 }
 
 // tagString returns a string representation of a tag Token's Data and Attr.
@@ -149,6 +152,57 @@ type Tokenizer struct {
     rawTag string
     // textIsRaw is whether the current text token's data is not escaped.
     textIsRaw bool
+    // convertNUL is whether NUL bytes in the current token's data should
+    // be converted into \ufffd replacement characters.
+    convertNUL bool
+    // allowCDATA is whether CDATA sections are allowed in the current context.
+    allowCDATA bool
+}
+
+// AllowCDATA sets whether or not the tokenizer recognizes <![CDATA[foo]]> as
+// the text "foo". The default value is false, which means to recognize it as
+// a bogus comment "<!-- [CDATA[foo]] -->" instead.
+//
+// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and
+// only if tokenizing foreign content, such as MathML and SVG. However,
+// tracking foreign-contentness is difficult to do purely in the tokenizer,
+// as opposed to the parser, due to HTML integration points: an <svg> element
+// can contain a <foreignObject> that is foreign-to-SVG but not foreign-to-
+// HTML. For strict compliance with the HTML5 tokenization algorithm, it is
+// the responsibility of the user of a tokenizer to call AllowCDATA as
+// appropriate. In practice, if using the tokenizer without caring whether
+// MathML or SVG CDATA is text or comments, such as tokenizing HTML to find
+// all the anchor text, it is acceptable to ignore this responsibility.
+func (z *Tokenizer) AllowCDATA(allowCDATA bool) {
+    z.allowCDATA = allowCDATA
+}
+
+// NextIsNotRawText instructs the tokenizer that the next token should not be
+// considered as 'raw text'. Some elements, such as script and title elements,
+// normally require the next token after the opening tag to be 'raw text' that
+// has no child elements. For example, tokenizing "<title>a<b>c</b>d</title>"
+// yields a start tag token for "<title>", a text token for "a<b>c</b>d", and
+// an end tag token for "</title>". There are no distinct start tag or end tag
+// tokens for the "<b>" and "</b>".
+//
+// This tokenizer implementation will generally look for raw text at the right
+// times. Strictly speaking, an HTML5 compliant tokenizer should not look for
+// raw text if in foreign content: <title> generally needs raw text, but a
+// <title> inside an <svg> does not. Another example is that a <textarea>
+// generally needs raw text, but a <textarea> is not allowed as an immediate
+// child of a <select>; in normal parsing, a <textarea> implies </select>, but
+// one cannot close the implicit element when parsing a <select>'s InnerHTML.
+// Similarly to AllowCDATA, tracking the correct moment to override raw-text-
+// ness is difficult to do purely in the tokenizer, as opposed to the parser.
+// For strict compliance with the HTML5 tokenization algorithm, it is the
+// responsibility of the user of a tokenizer to call NextIsNotRawText as
+// appropriate. In practice, like AllowCDATA, it is acceptable to ignore this
+// responsibility for basic usage.
+//
+// Note that this 'raw text' concept is different from the one offered by the
+// Tokenizer.Raw method.
+func (z *Tokenizer) NextIsNotRawText() {
+    z.rawTag = ""
 }
 
 // Err returns the error associated with the most recent ErrorToken token.
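As a usage sketch (not part of the patch; it assumes this package is importable as "exp/html", matching the libgo/go/exp/html path, and the inForeignContent flag is purely illustrative), a caller tracking foreign content might override raw-text mode like this:

package main

import (
    "exp/html"
    "fmt"
    "strings"
)

func main() {
    inForeignContent := true // e.g. a parser-level check saw an enclosing <svg>
    z := html.NewTokenizer(strings.NewReader("<title>a<b>c</b>d</title>"))
    for {
        tt := z.Next()
        if tt == html.ErrorToken {
            return // z.Err() reports io.EOF once the input is exhausted
        }
        if tt == html.StartTagToken && inForeignContent {
            if name, _ := z.TagName(); string(name) == "title" {
                // Without this call, the next token would be the
                // single raw text token "a<b>c</b>d".
                z.NextIsNotRawText()
            }
        }
        fmt.Println(tt, string(z.Raw()))
    }
}

With inForeignContent set to false, the same input yields only the three tokens that the doc comment above describes.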
@@ -233,6 +287,12 @@ func (z *Tokenizer) skipWhiteSpace() {
 // readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and
 // is typically something like "script" or "textarea".
 func (z *Tokenizer) readRawOrRCDATA() {
+    if z.rawTag == "script" {
+        z.readScript()
+        z.textIsRaw = true
+        z.rawTag = ""
+        return
+    }
 loop:
     for {
         c := z.readByte()
@@ -249,27 +309,8 @@ loop:
         if c != '/' {
             continue loop
         }
-        for i := 0; i < len(z.rawTag); i++ {
-            c = z.readByte()
-            if z.err != nil {
-                break loop
-            }
-            if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
-                continue loop
-            }
-        }
-        c = z.readByte()
-        if z.err != nil {
-            break loop
-        }
-        switch c {
-        case ' ', '\n', '\r', '\t', '\f', '/', '>':
-            // The 3 is 2 for the leading "</" plus 1 for the trailing character c.
-            z.raw.end -= 3 + len(z.rawTag)
+        if z.readRawEndTag() || z.err != nil {
             break loop
-        case '<':
-            // Step back one, to catch "</foo</foo>".
-            z.raw.end--
         }
     }
     z.data.end = z.raw.end
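A quick illustration of that end-tag matching (a sketch, not part of the patch; same assumed "exp/html" import path): the candidate name is compared ASCII case-insensitively and must be followed by whitespace, '/' or '>', so a longer name does not close the raw-text element:

package main

import (
    "fmt"
    "strings"

    "exp/html"
)

func main() {
    z := html.NewTokenizer(strings.NewReader(
        "<textarea>a</textareas>b</TEXTAREA>"))
    z.Next() // StartTagToken: <textarea>
    z.Next() // TextToken
    // "</textareas>" fails the delimiter check and stays part of the
    // text; "</TEXTAREA>" matches case-insensitively and ends it.
    fmt.Printf("%q\n", z.Text()) // "a</textareas>b"
    z.Next() // EndTagToken: </TEXTAREA>
}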
@@ -278,6 +319,242 @@ loop:
     z.rawTag = ""
 }
 
+// readRawEndTag attempts to read a tag like "</foo>", where "foo" is z.rawTag.
+// If it succeeds, it backs up the input position to reconsume the tag and
+// returns true. Otherwise it returns false. The opening "</" has already been
+// consumed.
+func (z *Tokenizer) readRawEndTag() bool {
+    for i := 0; i < len(z.rawTag); i++ {
+        c := z.readByte()
+        if z.err != nil {
+            return false
+        }
+        if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
+            z.raw.end--
+            return false
+        }
+    }
+    c := z.readByte()
+    if z.err != nil {
+        return false
+    }
+    switch c {
+    case ' ', '\n', '\r', '\t', '\f', '/', '>':
+        // The 3 is 2 for the leading "</" plus 1 for the trailing character c.
+        z.raw.end -= 3 + len(z.rawTag)
+        return true
+    }
+    z.raw.end--
+    return false
+}
+
+// readScript reads until the next </script> tag, following the byzantine
+// rules for escaping/hiding the closing tag.
+func (z *Tokenizer) readScript() {
+    defer func() {
+        z.data.end = z.raw.end
+    }()
+    var c byte
+
+scriptData:
+    c = z.readByte()
+    if z.err != nil {
+        return
+    }
+    if c == '<' {
+        goto scriptDataLessThanSign
+    }
+    goto scriptData
+
+scriptDataLessThanSign:
+    c = z.readByte()
+    if z.err != nil {
+        return
+    }
+    switch c {
+    case '/':
+        goto scriptDataEndTagOpen
+    case '!':
+        goto scriptDataEscapeStart
+    }
+    z.raw.end--
+    goto scriptData
+
+scriptDataEndTagOpen:
+    if z.readRawEndTag() || z.err != nil {
+        return
+    }
+    goto scriptData
+
+scriptDataEscapeStart:
+    c = z.readByte()
+    if z.err != nil {
+        return
+    }
+    if c == '-' {
+        goto scriptDataEscapeStartDash
+    }
+    z.raw.end--
+    goto scriptData
+
+scriptDataEscapeStartDash:
+    c = z.readByte()
+    if z.err != nil {
+        return
+    }
+    if c == '-' {
+        goto scriptDataEscapedDashDash
+    }
+    z.raw.end--
+    goto scriptData
+
+scriptDataEscaped:
+    c = z.readByte()
+    if z.err != nil {
+        return
+    }
+    switch c {
+    case '-':
+        goto scriptDataEscapedDash
+    case '<':
+        goto scriptDataEscapedLessThanSign
+    }
+    goto scriptDataEscaped
+
+scriptDataEscapedDash:
+    c = z.readByte()
+    if z.err != nil {
+        return
+    }
+    switch c {
+    case '-':
+        goto scriptDataEscapedDashDash
+    case '<':
+        goto scriptDataEscapedLessThanSign
+    }
+    goto scriptDataEscaped
+
+scriptDataEscapedDashDash:
+    c = z.readByte()
+    if z.err != nil {
+        return
+    }
+    switch c {
+    case '-':
+        goto scriptDataEscapedDashDash
+    case '<':
+        goto scriptDataEscapedLessThanSign
+    case '>':
+        goto scriptData
+    }
+    goto scriptDataEscaped
+
+scriptDataEscapedLessThanSign:
+    c = z.readByte()
+    if z.err != nil {
+        return
+    }
+    if c == '/' {
+        goto scriptDataEscapedEndTagOpen
+    }
+    if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
+        goto scriptDataDoubleEscapeStart
+    }
+    z.raw.end--
+    goto scriptData
+
+scriptDataEscapedEndTagOpen:
+    if z.readRawEndTag() || z.err != nil {
+        return
+    }
+    goto scriptDataEscaped
+
+scriptDataDoubleEscapeStart:
+    z.raw.end--
+    for i := 0; i < len("script"); i++ {
+        c = z.readByte()
+        if z.err != nil {
+            return
+        }
+        if c != "script"[i] && c != "SCRIPT"[i] {
+            z.raw.end--
+            goto scriptDataEscaped
+        }
+    }
+    c = z.readByte()
+    if z.err != nil {
+        return
+    }
+    switch c {
+    case ' ', '\n', '\r', '\t', '\f', '/', '>':
+        goto scriptDataDoubleEscaped
+    }
+    z.raw.end--
+    goto scriptDataEscaped
+
+scriptDataDoubleEscaped:
+    c = z.readByte()
+    if z.err != nil {
+        return
+    }
+    switch c {
+    case '-':
+        goto scriptDataDoubleEscapedDash
+    case '<':
+        goto scriptDataDoubleEscapedLessThanSign
+    }
+    goto scriptDataDoubleEscaped
+
+scriptDataDoubleEscapedDash:
+    c = z.readByte()
+    if z.err != nil {
+        return
+    }
+    switch c {
+    case '-':
+        goto scriptDataDoubleEscapedDashDash
+    case '<':
+        goto scriptDataDoubleEscapedLessThanSign
+    }
+    goto scriptDataDoubleEscaped
+
+scriptDataDoubleEscapedDashDash:
+    c = z.readByte()
+    if z.err != nil {
+        return
+    }
+    switch c {
+    case '-':
+        goto scriptDataDoubleEscapedDashDash
+    case '<':
+        goto scriptDataDoubleEscapedLessThanSign
+    case '>':
+        goto scriptData
+    }
+    goto scriptDataDoubleEscaped
+
+scriptDataDoubleEscapedLessThanSign:
+    c = z.readByte()
+    if z.err != nil {
+        return
+    }
+    if c == '/' {
+        goto scriptDataDoubleEscapeEnd
+    }
+    z.raw.end--
+    goto scriptDataDoubleEscaped
+
+scriptDataDoubleEscapeEnd:
+    if z.readRawEndTag() {
+        z.raw.end += len("</script>")
+        goto scriptDataEscaped
+    }
+    if z.err != nil {
+        return
+    }
+    goto scriptDataDoubleEscaped
+}
+
 // readComment reads the next comment token starting with "<!--". The opening
 // "<!--" has already been consumed.
 func (z *Tokenizer) readComment() {
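The practical effect of these escape states, sketched below under the same assumed "exp/html" import path (not part of the patch), is that a "<!-- ... -->" escape hides an inner "</script>" from the tokenizer:

package main

import (
    "fmt"
    "strings"

    "exp/html"
)

func main() {
    const src = `<script><!--
var s = "<script>not the end</script>";
--></script>`
    z := html.NewTokenizer(strings.NewReader(src))
    z.Next() // StartTagToken: <script>
    z.Next() // TextToken: the whole escaped body,
    // including the inner "</script>", which the double-escape
    // states above treat as script data rather than an end tag.
    fmt.Printf("%q\n", z.Text())
    z.Next() // EndTagToken: the real </script>
}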
@@ -341,8 +618,8 @@ func (z *Tokenizer) readUntilCloseAngle() {
 }
 
 // readMarkupDeclaration reads the next token starting with "<!". It might be
-// a "<!--comment-->", a "<!DOCTYPE foo>", or "<!a bogus comment". The opening
-// "<!" has already been consumed.
+// a "<!--comment-->", a "<!DOCTYPE foo>", a "<![CDATA[section]]>" or
+// "<!a bogus comment". The opening "<!" has already been consumed.
 func (z *Tokenizer) readMarkupDeclaration() TokenType {
     z.data.start = z.raw.end
     var c [2]byte
@@ -358,27 +635,81 @@ func (z *Tokenizer) readMarkupDeclaration() TokenType {
         return CommentToken
     }
     z.raw.end -= 2
+    if z.readDoctype() {
+        return DoctypeToken
+    }
+    if z.allowCDATA && z.readCDATA() {
+        z.convertNUL = true
+        return TextToken
+    }
+    // It's a bogus comment.
+    z.readUntilCloseAngle()
+    return CommentToken
+}
+
+// readDoctype attempts to read a doctype declaration and returns true if
+// successful. The opening "<!" has already been consumed.
+func (z *Tokenizer) readDoctype() bool {
     const s = "DOCTYPE"
     for i := 0; i < len(s); i++ {
         c := z.readByte()
         if z.err != nil {
             z.data.end = z.raw.end
-            return CommentToken
+            return false
         }
         if c != s[i] && c != s[i]+('a'-'A') {
             // Back up to read the fragment of "DOCTYPE" again.
             z.raw.end = z.data.start
-            z.readUntilCloseAngle()
-            return CommentToken
+            return false
         }
     }
     if z.skipWhiteSpace(); z.err != nil {
         z.data.start = z.raw.end
         z.data.end = z.raw.end
-        return DoctypeToken
+        return true
     }
     z.readUntilCloseAngle()
-    return DoctypeToken
+    return true
+}
+
+// readCDATA attempts to read a CDATA section and returns true if
+// successful. The opening "<!" has already been consumed.
+func (z *Tokenizer) readCDATA() bool {
+    const s = "[CDATA["
+    for i := 0; i < len(s); i++ {
+        c := z.readByte()
+        if z.err != nil {
+            z.data.end = z.raw.end
+            return false
+        }
+        if c != s[i] {
+            // Back up to read the fragment of "[CDATA[" again.
+            z.raw.end = z.data.start
+            return false
+        }
+    }
+    z.data.start = z.raw.end
+    brackets := 0
+    for {
+        c := z.readByte()
+        if z.err != nil {
+            z.data.end = z.raw.end
+            return true
+        }
+        switch c {
+        case ']':
+            brackets++
+        case '>':
+            if brackets >= 2 {
+                z.data.end = z.raw.end - len("]]>")
+                return true
+            }
+            brackets = 0
+        default:
+            brackets = 0
+        }
+    }
+    panic("unreachable")
 }
 
 // startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end]
@@ -406,29 +737,10 @@ loop:
 // readStartTag reads the next start tag token. The opening "<a" has already
 // been consumed, where 'a' means anything in [A-Za-z].
 func (z *Tokenizer) readStartTag() TokenType {
-    z.attr = z.attr[:0]
-    z.nAttrReturned = 0
-    // Read the tag name and attribute key/value pairs.
-    z.readTagName()
-    if z.skipWhiteSpace(); z.err != nil {
+    z.readTag(true)
+    if z.err != nil {
         return ErrorToken
     }
-    for {
-        c := z.readByte()
-        if z.err != nil || c == '>' {
-            break
-        }
-        z.raw.end--
-        z.readTagAttrKey()
-        z.readTagAttrVal()
-        // Save pendingAttr if it has a non-empty key.
-        if z.pendingAttr[0].start != z.pendingAttr[0].end {
-            z.attr = append(z.attr, z.pendingAttr)
-        }
-        if z.skipWhiteSpace(); z.err != nil {
-            break
-        }
-    }
     // Several tags flag the tokenizer's next token as raw.
     c, raw := z.buf[z.data.start], false
     if 'A' <= c && c <= 'Z' {
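To see both readMarkupDeclaration outcomes for a CDATA section side by side (a sketch, not part of the patch; "exp/html" import path assumed):

package main

import (
    "fmt"
    "strings"

    "exp/html"
)

func show(allowCDATA bool) {
    z := html.NewTokenizer(strings.NewReader("<![CDATA[a<b]]>"))
    z.AllowCDATA(allowCDATA)
    tt := z.Next()
    fmt.Printf("%v %q\n", tt, z.Text())
}

func main() {
    show(false) // Comment "[CDATA[a<b]]": treated as a bogus comment
    show(true)  // Text "a<b": the CDATA section becomes plain text
}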
@@ -458,16 +770,32 @@
     return StartTagToken
 }
 
-// readEndTag reads the next end tag token. The opening "</a" has already
-// been consumed, where 'a' means anything in [A-Za-z].
-func (z *Tokenizer) readEndTag() {
+// readTag reads the next tag token and its attributes. If saveAttr, those
+// attributes are saved in z.attr, otherwise z.attr is set to an empty slice.
+// The opening "<a" or "</a" has already been consumed, where 'a' means anything
+// in [A-Za-z].
+func (z *Tokenizer) readTag(saveAttr bool) {
     z.attr = z.attr[:0]
     z.nAttrReturned = 0
+    // Read the tag name and attribute key/value pairs.
     z.readTagName()
+    if z.skipWhiteSpace(); z.err != nil {
+        return
+    }
     for {
         c := z.readByte()
         if z.err != nil || c == '>' {
-            return
+            break
+        }
+        z.raw.end--
+        z.readTagAttrKey()
+        z.readTagAttrVal()
+        // Save pendingAttr if saveAttr and that attribute has a non-empty key.
+        if saveAttr && z.pendingAttr[0].start != z.pendingAttr[0].end {
+            z.attr = append(z.attr, z.pendingAttr)
+        }
+        if z.skipWhiteSpace(); z.err != nil {
+            break
         }
     }
 }
@@ -594,16 +922,19 @@ func (z *Tokenizer) Next() TokenType {
             for z.err == nil {
                 z.readByte()
             }
+            z.data.end = z.raw.end
             z.textIsRaw = true
         } else {
             z.readRawOrRCDATA()
         }
         if z.data.end > z.data.start {
             z.tt = TextToken
+            z.convertNUL = true
             return z.tt
         }
     }
     z.textIsRaw = false
+    z.convertNUL = false
 
 loop:
     for {
@@ -662,8 +993,12 @@ loop:
             continue loop
         }
         if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
-            z.readEndTag()
-            z.tt = EndTagToken
+            z.readTag(false)
+            if z.err != nil {
+                z.tt = ErrorToken
+            } else {
+                z.tt = EndTagToken
+            }
             return z.tt
         }
         z.raw.end--
@@ -696,6 +1031,43 @@ func (z *Tokenizer) Raw() []byte {
     return z.buf[z.raw.start:z.raw.end]
 }
 
+// convertNewlines converts "\r" and "\r\n" in s to "\n".
+// The conversion happens in place, but the resulting slice may be shorter.
+func convertNewlines(s []byte) []byte {
+    for i, c := range s {
+        if c != '\r' {
+            continue
+        }
+
+        src := i + 1
+        if src >= len(s) || s[src] != '\n' {
+            s[i] = '\n'
+            continue
+        }
+
+        dst := i
+        for src < len(s) {
+            if s[src] == '\r' {
+                if src+1 < len(s) && s[src+1] == '\n' {
+                    src++
+                }
+                s[dst] = '\n'
+            } else {
+                s[dst] = s[src]
+            }
+            src++
+            dst++
+        }
+        return s[:dst]
+    }
+    return s
+}
+
+var (
+    nul         = []byte("\x00")
+    replacement = []byte("\ufffd")
+)
+
 // Text returns the unescaped text of a text, comment or doctype token. The
 // contents of the returned slice may change on the next call to Next.
 func (z *Tokenizer) Text() []byte {
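The newline and NUL conversion is visible through Text(); a minimal sketch (not part of the patch, same assumed "exp/html" import path):

package main

import (
    "fmt"
    "strings"

    "exp/html"
)

func main() {
    z := html.NewTokenizer(strings.NewReader("a\r\nb\rc\x00d"))
    z.Next() // TextToken
    // "\r\n" and a bare "\r" both become "\n", and the NUL byte is
    // converted to U+FFFD because convertNUL is set for text tokens.
    fmt.Printf("%q\n", z.Text()) // "a\nb\nc\ufffdd"
}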
@@ -704,8 +1076,12 @@
         s := z.buf[z.data.start:z.data.end]
         z.data.start = z.raw.end
         z.data.end = z.raw.end
+        s = convertNewlines(s)
+        if (z.convertNUL || z.tt == CommentToken) && bytes.Contains(s, nul) {
+            s = bytes.Replace(s, nul, replacement, -1)
+        }
         if !z.textIsRaw {
-            s = unescape(s)
+            s = unescape(s, false)
         }
         return s
     }
@@ -739,7 +1115,7 @@ func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
             z.nAttrReturned++
             key = z.buf[x[0].start:x[0].end]
             val = z.buf[x[1].start:x[1].end]
-            return lower(key), unescape(val), z.nAttrReturned < len(z.attr)
+            return lower(key), unescape(convertNewlines(val), true), z.nAttrReturned < len(z.attr)
         }
     }
     return nil, nil, false
@@ -752,19 +1128,18 @@ func (z *Tokenizer) Token() Token {
     switch z.tt {
     case TextToken, CommentToken, DoctypeToken:
         t.Data = string(z.Text())
-    case StartTagToken, SelfClosingTagToken:
-        var attr []Attribute
+    case StartTagToken, SelfClosingTagToken, EndTagToken:
         name, moreAttr := z.TagName()
         for moreAttr {
            var key, val []byte
            key, val, moreAttr = z.TagAttr()
-            attr = append(attr, Attribute{"", string(key), string(val)})
+            t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)})
+        }
+        if a := atom.Lookup(name); a != 0 {
+            t.DataAtom, t.Data = a, a.String()
+        } else {
+            t.DataAtom, t.Data = 0, string(name)
         }
-        t.Data = string(name)
-        t.Attr = attr
-    case EndTagToken:
-        name, _ := z.TagName()
-        t.Data = string(name)
     }
     return t
 }
@@ -772,8 +1147,27 @@
 // NewTokenizer returns a new HTML Tokenizer for the given Reader.
 // The input is assumed to be UTF-8 encoded.
 func NewTokenizer(r io.Reader) *Tokenizer {
-    return &Tokenizer{
+    return NewTokenizerFragment(r, "")
+}
+
+// NewTokenizerFragment returns a new HTML Tokenizer for the given Reader, for
+// tokenizing an existing element's InnerHTML fragment. contextTag is that
+// element's tag, such as "div" or "iframe".
+//
+// For example, how the InnerHTML "a<b" is tokenized depends on whether it is
+// for a <p> tag or a <script> tag.
+//
+// The input is assumed to be UTF-8 encoded.
+func NewTokenizerFragment(r io.Reader, contextTag string) *Tokenizer {
+    z := &Tokenizer{
         r:   r,
         buf: make([]byte, 0, 4096),
     }
+    if contextTag != "" {
+        switch s := strings.ToLower(contextTag); s {
+        case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp":
+            z.rawTag = s
+        }
+    }
+    return z
 }
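Finally, a sketch tying the new entry points together (not part of the patch; it assumes the "exp/html" and "exp/html/atom" import paths used above):

package main

import (
    "fmt"
    "strings"

    "exp/html"
    "exp/html/atom"
)

func main() {
    // InnerHTML of a <script> element: the context tag starts the
    // tokenizer in raw-text mode, so "a<b" is one raw text token.
    z := html.NewTokenizerFragment(strings.NewReader("a<b"), "script")
    tt := z.Next()
    fmt.Printf("%v %q\n", tt, z.Text()) // Text "a<b"

    // Token() now fills in DataAtom for known tag names.
    z = html.NewTokenizer(strings.NewReader(`<div class="x">`))
    z.Next()
    t := z.Token()
    fmt.Println(t.DataAtom == atom.Div, t.Data) // true div
}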