diff options
author | Vlastimil Zíma <vlastimil.zima@nic.cz> | 2018-05-02 17:38:26 +0200 |
---|---|---|
committer | Vlastimil Zíma <vlastimil.zima@nic.cz> | 2018-05-02 17:38:26 +0200 |
commit | cf7908f1d8bfcb305dd547baf3d944807e0936a5 (patch) | |
tree | e965fed6f884ddcee0d2815f445852021577eb4f /openid | |
parent | 44293807fecb3eedf26a526dbb5961f1efa12642 (diff) | |
download | openid-cf7908f1d8bfcb305dd547baf3d944807e0936a5.tar.gz |
Refactor consumer HTML parse
Diffstat (limited to 'openid')
-rw-r--r-- | openid/consumer/discover.py | 30 | ||||
-rw-r--r-- | openid/consumer/html_parse.py | 263 | ||||
-rw-r--r-- | openid/test/linkparse.txt | 584 | ||||
-rw-r--r-- | openid/test/test_htmldiscover.py | 21 | ||||
-rw-r--r-- | openid/test/test_linkparse.py | 88 |
5 files changed, 33 insertions, 953 deletions
diff --git a/openid/consumer/discover.py b/openid/consumer/discover.py index b9bc30e..0824af4 100644 --- a/openid/consumer/discover.py +++ b/openid/consumer/discover.py @@ -16,8 +16,10 @@ __all__ = [ import logging import urlparse +from lxml.etree import LxmlError +from lxml.html import document_fromstring + from openid import fetchers, urinorm -from openid.consumer import html_parse from openid.message import OPENID1_NS as OPENID_1_0_MESSAGE_NS, OPENID2_NS as OPENID_2_0_MESSAGE_NS from openid.yadis import filters, xri, xrires from openid.yadis.discover import DiscoveryFailure, discover as yadisDiscover @@ -32,6 +34,8 @@ OPENID_2_0_TYPE = 'http://specs.openid.net/auth/2.0/signon' OPENID_1_1_TYPE = 'http://openid.net/signon/1.1' OPENID_1_0_TYPE = 'http://openid.net/signon/1.0' +LINK_REL_XPATH = "/html/head/link[contains(concat(' ', normalize-space(@rel), ' '), ' {} ')]" + class OpenIDServiceEndpoint(object): """Object representing an OpenID service endpoint. @@ -152,19 +156,29 @@ class OpenIDServiceEndpoint(object): (OPENID_2_0_TYPE, 'openid2.provider', 'openid2.local_id'), (OPENID_1_1_TYPE, 'openid.server', 'openid.delegate'), ] - - link_attrs = html_parse.parseLinkAttrs(html) services = [] + + try: + parsed_html = document_fromstring(html) + except LxmlError: + # It's a dumb function. Return empty results in case of an error. + return [] for type_uri, op_endpoint_rel, local_id_rel in discovery_types: - op_endpoint_url = html_parse.findFirstHref( - link_attrs, op_endpoint_rel) - if op_endpoint_url is None: + op_links = parsed_html.xpath(LINK_REL_XPATH.format(op_endpoint_rel)) + if not op_links: + continue + op_endpoint_url = op_links[0].get('href') + if not op_endpoint_url: continue + local_id_links = parsed_html.xpath(LINK_REL_XPATH.format(local_id_rel)) + local_id = None + if local_id_links: + local_id = local_id_links[0].get('href') + service = cls() service.claimed_id = uri - service.local_id = html_parse.findFirstHref( - link_attrs, local_id_rel) + service.local_id = local_id service.server_url = op_endpoint_url service.type_uris = [type_uri] diff --git a/openid/consumer/html_parse.py b/openid/consumer/html_parse.py deleted file mode 100644 index 3c2a025..0000000 --- a/openid/consumer/html_parse.py +++ /dev/null @@ -1,263 +0,0 @@ -""" -This module implements a VERY limited parser that finds <link> tags in -the head of HTML or XHTML documents and parses out their attributes -according to the OpenID spec. It is a liberal parser, but it requires -these things from the data in order to work: - - - There must be an open <html> tag - - - There must be an open <head> tag inside of the <html> tag - - - Only <link>s that are found inside of the <head> tag are parsed - (this is by design) - - - The parser follows the OpenID specification in resolving the - attributes of the link tags. This means that the attributes DO NOT - get resolved as they would by an XML or HTML parser. In particular, - only certain entities get replaced, and href attributes do not get - resolved relative to a base URL. - -From http://openid.net/specs.bml#linkrel: - - - The openid.server URL MUST be an absolute URL. OpenID consumers - MUST NOT attempt to resolve relative URLs. - - - The openid.server URL MUST NOT include entities other than &, - <, >, and ". - -The parser ignores SGML comments and <![CDATA[blocks]]>. Both kinds of -quoting are allowed for attributes. - -The parser deals with invalid markup in these ways: - - - Tag names are not case-sensitive - - - The <html> tag is accepted even when it is not at the top level - - - The <head> tag is accepted even when it is not a direct child of - the <html> tag, but a <html> tag must be an ancestor of the <head> - tag - - - <link> tags are accepted even when they are not direct children of - the <head> tag, but a <head> tag must be an ancestor of the <link> - tag - - - If there is no closing tag for an open <html> or <head> tag, the - remainder of the document is viewed as being inside of the tag. If - there is no closing tag for a <link> tag, the link tag is treated - as a short tag. Exceptions to this rule are that <html> closes - <html> and <body> or <head> closes <head> - - - Attributes of the <link> tag are not required to be quoted. - - - In the case of duplicated attribute names, the attribute coming - last in the tag will be the value returned. - - - Any text that does not parse as an attribute within a link tag will - be ignored. (e.g. <link pumpkin rel='openid.server' /> will ignore - pumpkin) - - - If there are more than one <html> or <head> tag, the parser only - looks inside of the first one. - - - The contents of <script> tags are ignored entirely, except unclosed - <script> tags. Unclosed <script> tags are ignored. - - - Any other invalid markup is ignored, including unclosed SGML - comments and unclosed <![CDATA[blocks. -""" - -__all__ = ['parseLinkAttrs'] - -import re -from functools import partial - -flags = ( - # Match newlines with '.' - re.DOTALL | - re.IGNORECASE | - # Allow comments and whitespace in patterns - re.VERBOSE | - # Make \b respect Unicode word boundaries - re.UNICODE -) - -# Stuff to remove before we start looking for tags -removed_re = re.compile(r''' - # Comments - <!--.*?--> - - # CDATA blocks -| <!\[CDATA\[.*?\]\]> - - # script blocks -| <script\b - - # make sure script is not an XML namespace - (?!:) - - [^>]*>.*?</script> - -''', flags) - -tag_expr = r''' -# Starts with the tag name at a word boundary, where the tag name is -# not a namespace -<%(tag_name)s\b(?!:) - -# All of the stuff up to a ">", hopefully attributes. -(?P<attrs>[^>]*?) - -(?: # Match a short tag - /> - -| # Match a full tag - > - - (?P<contents>.*?) - - # Closed by - (?: # One of the specified close tags - </?%(closers)s\s*> - - # End of the string - | \Z - - ) - -) -''' - - -def tagMatcher(tag_name, *close_tags): - if close_tags: - options = '|'.join((tag_name,) + close_tags) - closers = '(?:%s)' % (options,) - else: - closers = tag_name - - expr = tag_expr % locals() - return re.compile(expr, flags) - - -# Must contain at least an open html and an open head tag -html_find = tagMatcher('html') -head_find = tagMatcher('head', 'body') -link_find = re.compile(r'<link\b(?!:)', flags) - -attr_find = re.compile(r''' -# Must start with a sequence of word-characters, followed by an equals sign -(?P<attr_name>\w+)= - -# Then either a quoted or unquoted attribute -(?: - - # Match everything that\'s between matching quote marks - (?P<qopen>["\'])(?P<q_val>.*?)(?P=qopen) -| - - # If the value is not quoted, match up to whitespace - (?P<unq_val>(?:[^\s<>/]|/(?!>))+) -) - -| - -(?P<end_link>[<>]) -''', flags) - -# Entity replacement: -replacements = { - 'amp': '&', - 'lt': '<', - 'gt': '>', - 'quot': '"', -} - -ent_replace = re.compile(r'&(%s);' % '|'.join(replacements.keys())) - - -def replaceEnt(mo): - "Replace the entities that are specified by OpenID" - return replacements.get(mo.group(1), mo.group()) - - -def parseLinkAttrs(html): - """Find all link tags in a string representing a HTML document and - return a list of their attributes. - - @param html: the text to parse - @type html: str or unicode - - @return: A list of dictionaries of attributes, one for each link tag - @rtype: [[(type(html), type(html))]] - """ - stripped = removed_re.sub('', html) - html_mo = html_find.search(stripped) - if html_mo is None or html_mo.start('contents') == -1: - return [] - - start, end = html_mo.span('contents') - head_mo = head_find.search(stripped, start, end) - if head_mo is None or head_mo.start('contents') == -1: - return [] - - start, end = head_mo.span('contents') - link_mos = link_find.finditer(stripped, head_mo.start(), head_mo.end()) - - matches = [] - for link_mo in link_mos: - start = link_mo.start() + 5 - link_attrs = {} - for attr_mo in attr_find.finditer(stripped, start): - if attr_mo.lastgroup == 'end_link': - break - - # Either q_val or unq_val must be present, but not both - # unq_val is a True (non-empty) value if it is present - attr_name, q_val, unq_val = attr_mo.group( - 'attr_name', 'q_val', 'unq_val') - attr_val = ent_replace.sub(replaceEnt, unq_val or q_val) - - link_attrs[attr_name] = attr_val - - matches.append(link_attrs) - - return matches - - -def relMatches(rel_attr, target_rel): - """Does this target_rel appear in the rel_str?""" - # XXX: TESTME - rels = rel_attr.strip().split() - for rel in rels: - rel = rel.lower() - if rel == target_rel: - return 1 - - return 0 - - -def linkHasRel(link_attrs, target_rel): - """Does this link have target_rel as a relationship?""" - # XXX: TESTME - rel_attr = link_attrs.get('rel') - return rel_attr and relMatches(rel_attr, target_rel) - - -def findLinksRel(link_attrs_list, target_rel): - """Filter the list of link attributes on whether it has target_rel - as a relationship.""" - # XXX: TESTME - matchesTarget = partial(linkHasRel, target_rel=target_rel) - return [i for i in link_attrs_list if matchesTarget(i)] - - -def findFirstHref(link_attrs_list, target_rel): - """Return the value of the href attribute for the first link tag - in the list that has target_rel as a relationship.""" - # XXX: TESTME - matches = findLinksRel(link_attrs_list, target_rel) - if not matches: - return None - first = matches[0] - return first.get('href') diff --git a/openid/test/linkparse.txt b/openid/test/linkparse.txt deleted file mode 100644 index 74c63ca..0000000 --- a/openid/test/linkparse.txt +++ /dev/null @@ -1,584 +0,0 @@ -Num Tests: 72 - -OpenID link parsing test cases -Copyright (C) 2005-2008, JanRain, Inc. -See COPYING for license information. - -File format ------------ - -All text before the first triple-newline (this chunk) should be ignored. - -This file may be interpreted as Latin-1 or UTF-8. - -Test cases separated by three line separators (`\n\n\n'). The test -cases consist of a headers section followed by a data block. These are -separated by a double newline. The headers consist of the header name, -followed by a colon, a space, the value, and a newline. There must be -one, and only one, `Name' header for a test case. There may be zero or -more link headers. The `Link' header consists of whitespace-separated -attribute pairs. A link header with an empty string as a value -indicates an empty but present link tag. The attribute pairs are `=' -separated and not quoted. - -Optional Links and attributes have a trailing `*'. A compilant -implementation may produce this as output or may not. A compliant -implementation will not produce any output that is absent from this -file. - - -Name: No link tag at all - -<html> -<head> -</head> -</html> - - -Name: Link element first - -<link> - - -Name: Link inside HTML, not head - -<html> -<link> - - -Name: Link inside head, not html - -<head> -<link> - - -Name: Link inside html, after head - -<html> -<head> -</head> -<link> - - -Name: Link inside html, before head - -<html> -<link> -<head> - - -Name: Link before html and head - -<link> -<html> -<head> - - -Name: Link after html document with head - -<html> -<head> -</head> -</html> -<link> - - -Name: Link inside html inside head, inside another html - -<html> -<head> -<html> -<link> - - -Name: Link inside html inside head - -<head> -<html> -<link> - - -Name: link inside body inside head inside html - -<html> -<head> -<body> -<link> - - -Name: Link inside head inside head inside html - -<html> -<head> -<head> -<link> - - -Name: Link inside script inside head inside html - -<html> -<head> -<script> -<link> -</script> - - -Name: Link inside comment inside head inside html - -<html> -<head/> -<link> - - -Name: Link inside of head after short head - -<html> -<head/> -<head> -<link> - - -Name: Plain vanilla -Link: - -<html> -<head> -<link> - - -Name: Ignore tags in the <script:... > namespace -Link*: - -<html> -<head> -<script:paddypan> -<link> -</script:paddypan> - - -Name: Short link tag -Link: - -<html> -<head> -<link/> - - -Name: Spaces in the HTML tag -Link: - -<html > -<head> -<link> - - -Name: Spaces in the head tag -Link: - -<html> -<head > -<link> - - -Name: Spaces in the link tag -Link: - -<html> -<head> -<link > - - -Name: No whitespace -Link: - -<html><head><link> - - -Name: Closed head tag -Link: - -<html> -<head> -<link> -</head> - - -Name: One good, one bad (after close head) -Link: - -<html> -<head> -<link> -</head> -<link> - - -Name: One good, one bad (after open body) -Link: - -<html> -<head> -<link> -<body> -<link> - - -Name: ill formed (missing close head) -Link: - -<html> -<head> -<link> -</html> - - -Name: Ill formed (no close head, link after </html>) -Link: - -<html> -<head> -<link> -</html> -<link> - - -Name: Ignore random tags inside of html -Link: - -<html> -<delicata> -<head> -<title> -<link> - - -Name: case-folding -Link*: - -<HtMl> -<hEaD> -<LiNk> - - -Name: unexpected tags -Link: - -<butternut> -<html> -<summer> -<head> -<turban> -<link> - - -Name: un-closed script tags -Link*: - -<html> -<head> -<script> -<link> - - -Name: un-closed script tags (no whitespace) -Link*: - -<html><head><script><link> - - -Name: un-closed comment -Link*: - -<html> -<head> -<!-- -<link> - - -Name: un-closed CDATA -Link*: - -<html> -<head> -<![CDATA[ -<link> - - -Name: cdata-like -Link*: - -<html> -<head> -<![ACORN[ -<link> -]]> - - -Name: comment close only -Link: - -<html> -<head> -<link> ---> - - -Name: Vanilla, two links -Link: -Link: - -<html> -<head> -<link> -<link> - - -Name: extra tag, two links -Link: -Link: - -<html> -<gold nugget> -<head> -<link> -<link> - - -Name: case-fold, body ends, two links -Link: -Link*: - -<html> -<head> -<link> -<LiNk> -<body> -<link> - - -Name: simple, non-quoted rel -Link: rel=openid.server - -<html><head><link rel=openid.server> - - -Name: short tag has rel -Link: rel=openid.server - -<html><head><link rel=openid.server/> - - -Name: short tag w/space has rel -Link: rel=openid.server - -<html><head><link rel=openid.server /> - - -Name: extra non-attribute, has rel -Link: rel=openid.server - -<html><head><link hubbard rel=openid.server> - - -Name: non-attr, has rel, short -Link: rel=openid.server - -<html><head><link hubbard rel=openid.server/> - - -Name: non-attr, has rel, short, space -Link: rel=openid.server - -<html><head><link hubbard rel=openid.server /> - - -Name: misplaced slash has rel -Link: rel=openid.server - -<html><head><link / rel=openid.server> - - -Name: quoted rel -Link: rel=openid.server - -<html><head><link rel="openid.server"> - - -Name: single-quoted rel -Link: rel=openid.server - -<html><head><link rel='openid.server'> - - -Name: two links w/ rel -Link: x=y -Link: a=b - -<html><head><link x=y><link a=b> - - -Name: non-entity -Link: x=&y - -<html><head><link x=&y> - - -Name: quoted non-entity -Link: x=&y - -<html><head><link x="&y"> - - -Name: quoted entity -Link: x=& - -<html><head><link x="&"> - - -Name: entity not processed -Link: x= - -<html><head><link x=""> - - -Name: < -Link: x=< - -<html><head><link x="<"> - - -Name: > -Link: x=> - -<html><head><link x=">"> - - -Name: " -Link: x=" - -<html><head><link x="""> - - -Name: &" -Link: x=&" - -<html><head><link x="&""> - - -Name: mixed entity and non-entity -Link: x=&"…> - -<html><head><link x="&"…>"> - - -Name: mixed entity and non-entity (w/normal chars) -Link: x=x&"…>x - -<html><head><link x="x&"…>x"> - - -Name: broken tags -Link*: x=y - -<html><head><link x=y<> - - -Name: missing close pointy -Link*: x=y -Link: z=y - -<html><head><link x=y<link z=y /> - - -Name: missing attribute value -Link: x=y y*= -Link: x=y - -<html><head><link x=y y=><link x=y /> - - -Name: Missing close pointy (no following) -Link*: x=y - -<html><head><link x=y - - -Name: Should be quoted -Link*: x=< - -<html><head><link x="<"> - - -Name: Should be quoted (2) -Link*: x=> - -<html><head><link x=">"> - - -Name: Repeated attribute -Link: x=y - -<html><head><link x=z x=y> - - -Name: Repeated attribute (2) -Link: x=y - -<html><head><link x=y x=y> - - -Name: Two attributes -Link: x=y y=z - -<html><head><link x=y y=z> - - -Name: Well-formed link rel="openid.server" -Link: rel=openid.server href=http://www.myopenid.com/server - -<html> - <head> - <link rel="openid.server" - href="http://www.myopenid.com/server" /> - </head> -</html> - - -Name: Well-formed link rel="openid.server" and "openid.delegate" -Link: rel=openid.server href=http://www.myopenid.com/server -Link: rel=openid.delegate href=http://example.myopenid.com/ - -<html><head><link rel="openid.server" - href="http://www.myopenid.com/server" /> - <link rel="openid.delegate" href="http://example.myopenid.com/" /> -</head></html> - - -Name: from brian's livejournal page -Link: rel=stylesheet href=http://www.livejournal.com/~serotta/res/319998/stylesheet?1130478711 type=text/css -Link: rel=openid.server href=http://www.livejournal.com/openid/server.bml - -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" - "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> -<html xmlns="http://www.w3.org/1999/xhtml"> - <head> - <link rel="stylesheet" - href="http://www.livejournal.com/~serotta/res/319998/stylesheet?1130478711" - type="text/css" /> - <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> - <meta name="foaf:maker" - content="foaf:mbox_sha1sum '12f8abdacb5b1a806711e23249da592c0d316260'" /> - <meta name="robots" content="noindex, nofollow, noarchive" /> - <meta name="googlebot" content="nosnippet" /> - <link rel="openid.server" - href="http://www.livejournal.com/openid/server.bml" /> - <title>Brian</title> - </head> - - -Name: non-ascii (Latin-1 or UTF8) -Link: x=® - -<html><head><link x="®"> - - diff --git a/openid/test/test_htmldiscover.py b/openid/test/test_htmldiscover.py index 65b036f..b4caeb3 100644 --- a/openid/test/test_htmldiscover.py +++ b/openid/test/test_htmldiscover.py @@ -3,14 +3,15 @@ import unittest from openid.consumer.discover import OpenIDServiceEndpoint -class BadLinksTestCase(unittest.TestCase): - cases = [ - '', - "http://not.in.a.link.tag/", - '<link rel="openid.server" href="not.in.html.or.head" />', - ] +class TestFromHTML(unittest.TestCase): + """Test `OpenIDServiceEndpoint.fromHTML`.""" - def test_from_html(self): - for html in self.cases: - actual = OpenIDServiceEndpoint.fromHTML('http://unused.url/', html) - self.assertEqual(actual, []) + def test_empty(self): + self.assertEqual(OpenIDServiceEndpoint.fromHTML('http://example.url/', ''), []) + + def test_invalid_html(self): + self.assertEqual(OpenIDServiceEndpoint.fromHTML('http://example.url/', "http://not.in.a.link.tag/"), []) + + def test_no_op_url(self): + html = '<html><head><link rel="openid.server"></head></html>' + self.assertEqual(OpenIDServiceEndpoint.fromHTML('http://example.url/', html), []) diff --git a/openid/test/test_linkparse.py b/openid/test/test_linkparse.py deleted file mode 100644 index 077caaf..0000000 --- a/openid/test/test_linkparse.py +++ /dev/null @@ -1,88 +0,0 @@ -"""Test `openid.consumer.html_parse` module.""" -import os.path -import unittest - -from openid.consumer.html_parse import parseLinkAttrs - - -def parseLink(line): - parts = line.split() - optional = parts[0] == 'Link*:' - assert optional or parts[0] == 'Link:' - - attrs = {} - for attr in parts[1:]: - k, v = attr.split('=', 1) - if k[-1] == '*': - attr_optional = 1 - k = k[:-1] - else: - attr_optional = 0 - - attrs[k] = (attr_optional, v) - - return (optional, attrs) - - -def parseCase(s): - header, markup = s.split('\n\n', 1) - lines = header.split('\n') - name = lines.pop(0) - assert name.startswith('Name: ') - desc = name[6:] - return desc, markup, [parseLink(l) for l in lines] - - -def parseTests(s): - tests = [] - - cases = s.split('\n\n\n') - header = cases.pop(0) - tests_line, _ = header.split('\n', 1) - k, v = tests_line.split(': ') - assert k == 'Num Tests' - num_tests = int(v) - - for case in cases[:-1]: - desc, markup, links = parseCase(case) - tests.append((desc, markup, links, case)) - - assert len(tests) == num_tests, (len(tests), num_tests) - return num_tests, tests - - -with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'linkparse.txt')) as link_test_data_file: - link_test_data = link_test_data_file.read().decode('utf-8') - - -class LinkTest(unittest.TestCase): - """Test `parseLinkAttrs` function.""" - - def runTest(self): - num_tests, test_cases = parseTests(link_test_data) - - for desc, case, expected, raw in test_cases: - actual = parseLinkAttrs(case) - i = 0 - for optional, exp_link in expected: - if optional: - if i >= len(actual): - continue - - act_link = actual[i] - for k, (o, v) in exp_link.items(): - if o: - act_v = act_link.get(k) - if act_v is None: - continue - else: - act_v = act_link[k] - - if optional and v != act_v: - break - - self.assertEqual(v, act_v) - else: - i += 1 - - assert i == len(actual) |