From f36a13c620c829237cb6466cf1cb80819121a75d Mon Sep 17 00:00:00 2001 From: Ezio Melotti Date: Mon, 14 Nov 2011 18:53:33 +0200 Subject: #1745761, #755670, #13357, #12629, #1200313: improve attribute handling in HTMLParser. --- Lib/html/parser.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'Lib/html/parser.py') diff --git a/Lib/html/parser.py b/Lib/html/parser.py index afdb305d08..662e85575a 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -30,8 +30,8 @@ attrfind = re.compile( r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') attrfind_tolerant = re.compile( - r',?\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?') + r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name (?:\s+ # whitespace before attribute name @@ -49,16 +49,16 @@ locatestarttagend = re.compile(r""" locatestarttagend_tolerant = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name (?:\s* # optional whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator + (?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name + (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value + |"[^"]*" # LIT-enclosed value + |(?!['"])[^>\s]* # bare value ) (?:\s*,)* # possibly followed by a comma - )? - ) - )* + )?\s* + )* + )? \s* # trailing whitespace """, re.VERBOSE) endendtag = re.compile('>') @@ -295,6 +295,7 @@ class HTMLParser(_markupbase.ParserBase): elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] + if attrvalue: attrvalue = self.unescape(attrvalue) attrs.append((attrname.lower(), attrvalue)) k = m.end() -- cgit v1.2.1