From e0b3152a799e7bb04770fcddf010b22c0b05379e Mon Sep 17 00:00:00 2001 From: Iwan Aucamp Date: Wed, 17 May 2023 18:51:55 +0200 Subject: fix: HTTP 308 Permanent Redirect status code handling (#2389) Change the handling of HTTP status code 308 to behave more like `urllib.request.HTTPRedirectHandler`, most critically, the new 308 handling will create a new `urllib.request.Request` object with the new URL, which will prevent state from being carried over from the original request. One case where this is important is when the domain name changes, for example, when the original URL is `http://www.w3.org/ns/adms.ttl` and the redirect URL is `https://uri.semic.eu/w3c/ns/adms.ttl`. With the previous behaviour, the redirect would contain a `Host` header with the value `www.w3.org` instead of `uri.semic.eu` because the `Host` header is placed in `Request.unredirected_hdrs` and takes precedence over the `Host` header in `Request.headers`. Other changes: - Only handle HTTP status code 308 on Python versions before 3.11 as Python 3.11 will handle 308 by default [[ref](https://docs.python.org/3.11/whatsnew/changelog.html#id128)]. - Move code which uses `http://www.w3.org/ns/adms.ttl` and `http://www.w3.org/ns/adms.rdf` out of `test_guess_format_for_parse` into a separate parameterized test, which instead uses the embedded http server. This allows the test to fully control the `Content-Type` header in the response instead of relying on the value that the server is sending. This is needed because the server is sending `Content-Type: text/plain` for the `adms.ttl` file, which is not a valid RDF format, and the test is expecting `Content-Type: text/turtle`. Fixes: - . --- rdflib/_networking.py | 117 ++++++++++++ rdflib/parser.py | 19 +- test/conftest.py | 34 +++- test/data.py | 18 ++ test/data/defined_namespaces/adms.rdf | 277 +++++++++++++++++++++++++++++ test/data/defined_namespaces/adms.ttl | 175 ++++++++++++++++++ test/data/defined_namespaces/rdfs.rdf | 130 ++++++++++++++ test/data/fetcher.py | 15 ++ test/test_graph/test_graph.py | 61 ++++++- test/test_graph/test_graph_redirect.py | 45 +++++ test/test_misc/test_input_source.py | 17 +- test/test_misc/test_networking_redirect.py | 217 ++++++++++++++++++++++ test/utils/exceptions.py | 29 +++ test/utils/http.py | 9 + test/utils/httpfileserver.py | 10 +- 15 files changed, 1121 insertions(+), 52 deletions(-) create mode 100644 rdflib/_networking.py create mode 100644 test/data/defined_namespaces/adms.rdf create mode 100644 test/data/defined_namespaces/adms.ttl create mode 100644 test/data/defined_namespaces/rdfs.rdf create mode 100644 test/test_graph/test_graph_redirect.py create mode 100644 test/test_misc/test_networking_redirect.py create mode 100644 test/utils/exceptions.py diff --git a/rdflib/_networking.py b/rdflib/_networking.py new file mode 100644 index 00000000..311096a8 --- /dev/null +++ b/rdflib/_networking.py @@ -0,0 +1,117 @@ +from __future__ import annotations + +import string +import sys +from typing import Dict +from urllib.error import HTTPError +from urllib.parse import quote as urlquote +from urllib.parse import urljoin, urlsplit +from urllib.request import HTTPRedirectHandler, Request, urlopen +from urllib.response import addinfourl + + +def _make_redirect_request(request: Request, http_error: HTTPError) -> Request: + """ + Create a new request object for a redirected request. + + The logic is based on `urllib.request.HTTPRedirectHandler` from `this commit _`. + + :param request: The original request that resulted in the redirect. + :param http_error: The response to the original request that indicates a + redirect should occur and contains the new location. + :return: A new request object to the location indicated by the response. + :raises HTTPError: the supplied ``http_error`` if the redirect request + cannot be created. + :raises ValueError: If the response code is `None`. + :raises ValueError: If the response does not contain a ``Location`` header + or the ``Location`` header is not a string. + :raises HTTPError: If the scheme of the new location is not ``http``, + ``https``, or ``ftp``. + :raises HTTPError: If there are too many redirects or a redirect loop. + """ + new_url = http_error.headers.get("Location") + if new_url is None: + raise http_error + if not isinstance(new_url, str): + raise ValueError(f"Location header {new_url!r} is not a string") + + new_url_parts = urlsplit(new_url) + + # For security reasons don't allow redirection to anything other than http, + # https or ftp. + if new_url_parts.scheme not in ("http", "https", "ftp", ""): + raise HTTPError( + new_url, + http_error.code, + f"{http_error.reason} - Redirection to url {new_url!r} is not allowed", + http_error.headers, + http_error.fp, + ) + + # http.client.parse_headers() decodes as ISO-8859-1. Recover the original + # bytes and percent-encode non-ASCII bytes, and any special characters such + # as the space. + new_url = urlquote(new_url, encoding="iso-8859-1", safe=string.punctuation) + new_url = urljoin(request.full_url, new_url) + + # XXX Probably want to forget about the state of the current + # request, although that might interact poorly with other + # handlers that also use handler-specific request attributes + content_headers = ("content-length", "content-type") + newheaders = { + k: v for k, v in request.headers.items() if k.lower() not in content_headers + } + new_request = Request( + new_url, + headers=newheaders, + origin_req_host=request.origin_req_host, + unverifiable=True, + ) + + visited: Dict[str, int] + if hasattr(request, "redirect_dict"): + visited = request.redirect_dict + if ( + visited.get(new_url, 0) >= HTTPRedirectHandler.max_repeats + or len(visited) >= HTTPRedirectHandler.max_redirections + ): + raise HTTPError( + request.full_url, + http_error.code, + HTTPRedirectHandler.inf_msg + http_error.reason, + http_error.headers, + http_error.fp, + ) + else: + visited = {} + setattr(request, "redirect_dict", visited) + + setattr(new_request, "redirect_dict", visited) + visited[new_url] = visited.get(new_url, 0) + 1 + return new_request + + +def _urlopen(request: Request) -> addinfourl: + """ + This is a shim for `urlopen` that handles HTTP redirects with status code + 308 (Permanent Redirect). + + This function should be removed once all supported versions of Python + handles the 308 HTTP status code. + + :param request: The request to open. + :return: The response to the request. + """ + try: + return urlopen(request) + except HTTPError as error: + if error.code == 308 and sys.version_info < (3, 11): + # HTTP response code 308 (Permanent Redirect) is not supported by python + # versions older than 3.11. See and + # for more details. + # This custom error handling should be removed once all supported + # versions of Python handles 308. + new_request = _make_redirect_request(request, error) + return _urlopen(new_request) + else: + raise diff --git a/rdflib/parser.py b/rdflib/parser.py index 6cf6f1da..a35c1d82 100644 --- a/rdflib/parser.py +++ b/rdflib/parser.py @@ -27,13 +27,13 @@ from typing import ( Tuple, Union, ) -from urllib.error import HTTPError from urllib.parse import urljoin -from urllib.request import Request, url2pathname, urlopen +from urllib.request import Request, url2pathname from xml.sax import xmlreader import rdflib.util from rdflib import __version__ +from rdflib._networking import _urlopen from rdflib.namespace import Namespace from rdflib.term import URIRef @@ -267,21 +267,6 @@ class URLInputSource(InputSource): req = Request(system_id, None, myheaders) # type: ignore[arg-type] - def _urlopen(req: Request) -> Any: - try: - return urlopen(req) - except HTTPError as ex: - # 308 (Permanent Redirect) is not supported by current python version(s) - # See https://bugs.python.org/issue40321 - # This custom error handling should be removed once all - # supported versions of python support 308. - if ex.code == 308: - # type error: Incompatible types in assignment (expression has type "Optional[Any]", variable has type "str") - req.full_url = ex.headers.get("Location") # type: ignore[assignment] - return _urlopen(req) - else: - raise - response: addinfourl = _urlopen(req) self.url = response.geturl() # in case redirections took place self.links = self.get_links(response) diff --git a/test/conftest.py b/test/conftest.py index 2f61c9fe..38f4dabc 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -44,22 +44,44 @@ def rdfs_graph() -> Graph: return Graph().parse(TEST_DATA_DIR / "defined_namespaces/rdfs.ttl", format="turtle") +_ServedBaseHTTPServerMocks = Tuple[ServedBaseHTTPServerMock, ServedBaseHTTPServerMock] + + @pytest.fixture(scope="session") -def _session_function_httpmock() -> Generator[ServedBaseHTTPServerMock, None, None]: +def _session_function_httpmocks() -> Generator[_ServedBaseHTTPServerMocks, None, None]: """ This fixture is session scoped, but it is reset for each function in :func:`function_httpmock`. This should not be used directly. """ - with ServedBaseHTTPServerMock() as httpmock: - yield httpmock + with ServedBaseHTTPServerMock() as httpmock_a, ServedBaseHTTPServerMock() as httpmock_b: + yield httpmock_a, httpmock_b @pytest.fixture(scope="function") def function_httpmock( - _session_function_httpmock: ServedBaseHTTPServerMock, + _session_function_httpmocks: _ServedBaseHTTPServerMocks, ) -> Generator[ServedBaseHTTPServerMock, None, None]: - _session_function_httpmock.reset() - yield _session_function_httpmock + """ + HTTP server mock that is reset for each test function. + """ + (mock, _) = _session_function_httpmocks + mock.reset() + yield mock + + +@pytest.fixture(scope="function") +def function_httpmocks( + _session_function_httpmocks: _ServedBaseHTTPServerMocks, +) -> Generator[Tuple[ServedBaseHTTPServerMock, ServedBaseHTTPServerMock], None, None]: + """ + Alternative HTTP server mock that is reset for each test function. + + This exists in case a tests needs to work with two different HTTP servers. + """ + (mock_a, mock_b) = _session_function_httpmocks + mock_a.reset() + mock_b.reset() + yield mock_a, mock_b @pytest.fixture(scope="session", autouse=True) diff --git a/test/data.py b/test/data.py index f1271aae..779c522a 100644 --- a/test/data.py +++ b/test/data.py @@ -1,6 +1,7 @@ from pathlib import Path from rdflib import URIRef +from rdflib.graph import Graph TEST_DIR = Path(__file__).parent TEST_DATA_DIR = TEST_DIR / "data" @@ -19,3 +20,20 @@ cheese = URIRef("urn:example:cheese") context0 = URIRef("urn:example:context-0") context1 = URIRef("urn:example:context-1") context2 = URIRef("urn:example:context-2") + + +simple_triple_graph = Graph().add( + ( + URIRef("http://example.org/subject"), + URIRef("http://example.org/predicate"), + URIRef("http://example.org/object"), + ) +) +""" +A simple graph with a single triple. This is equivalent to the following RDF files: + +* ``test/data/variants/simple_triple.nq`` +* ``test/data/variants/simple_triple.nt`` +* ``test/data/variants/simple_triple.ttl`` +* ``test/data/variants/simple_triple.xml`` +""" diff --git a/test/data/defined_namespaces/adms.rdf b/test/data/defined_namespaces/adms.rdf new file mode 100644 index 00000000..cb56922a --- /dev/null +++ b/test/data/defined_namespaces/adms.rdf @@ -0,0 +1,277 @@ + + + + 2023-04-05 + + + + + Semantic Interoperability Community (SEMIC) + + + adms + adms + + + Bert + Van Nuffelen + + + + TenForce + + + + + + + Natasa + Sofou + + + + + Pavlina + Fragkou + + + SEMIC EU + + + + + + + Makx + Dekkers + + + + + Pavlina + Fragkou + + + SEMIC EU + + + + + + + An abstract entity that reflects the intellectual content of the asset and represents those characteristics of the asset that are independent of its physical embodiment. This abstract entity combines the FRBR entities work (a distinct intellectual or artistic creation) and expression (the intellectual or artistic realization of a work) + + Asset + + + A particular physical embodiment of an Asset, which is an example of the FRBR entity manifestation (the physical embodiment of an expression of a work). + + Asset Distribution + + + A system or service that provides facilities for storage and maintenance of descriptions of Assets and Asset Distributions, and functionality that allows users to search and access these descriptions. An Asset Repository will typically contain descriptions of several Assets and related Asset Distributions. + + Asset repository + + + This is based on the UN/CEFACT Identifier class. + + Identifier + + + Links a resource to an adms:Identifier class. + + + identifier + + + + An Asset that is contained in the Asset being described, e.g. when there are several vocabularies defined in a single document. + + + included asset + + + + The interoperability level for which the Asset is relevant. + + + interoperability level + + + + A link to the current or latest version of the Asset. + + + last + + + + + A link to the next version of the Asset. + + + next + + + + + A link to the previous version of the Asset. + + + prev + + + + + More information about the format in which an Asset Distribution is released. This is different from the file format as, for example, a ZIP file (file format) could contain an XML schema (representation technique). + + + representation technique + + + + Links to a sample of an Asset (which is itself an Asset). + + + sample + + + + The name of the agency that issued the identifier. + + + schema agency + + + + The status of the Asset in the context of a particular workflow process. + + + status + + + + A schema according to which the Asset Repository can provide data about its content, e.g. ADMS. + + + supported schema + + + + Links Assets that are translations of each other. + + + translation + + + + A description of changes between this version and the previous version of the Asset. + + + version notes + + + diff --git a/test/data/defined_namespaces/adms.ttl b/test/data/defined_namespaces/adms.ttl new file mode 100644 index 00000000..86561101 --- /dev/null +++ b/test/data/defined_namespaces/adms.ttl @@ -0,0 +1,175 @@ +@prefix rdf: . + + + "2023-04-05" ; + ; + [ + ; + "Semantic Interoperability Community (SEMIC)" + ] ; + a ; + "adms"@en, "adms"@nl ; + [ + a ; + "Bert" ; + "Van Nuffelen" ; + ; + [ + "TenForce" + ] + ], [ + a ; + "Natasa" ; + "Sofou" + ], [ + a ; + "Pavlina" ; + "Fragkou" ; + [ + "SEMIC EU" + ] + ], [ + a ; + "Makx" ; + "Dekkers" + ] ; + [ + a ; + "Pavlina" ; + "Fragkou" ; + [ + "SEMIC EU" + ] + ] . + + + a ; + "An abstract entity that reflects the intellectual content of the asset and represents those characteristics of the asset that are independent of its physical embodiment. This abstract entity combines the FRBR entities work (a distinct intellectual or artistic creation) and expression (the intellectual or artistic realization of a work)"@en ; + ; + "Asset"@en . + + + a ; + "A particular physical embodiment of an Asset, which is an example of the FRBR entity manifestation (the physical embodiment of an expression of a work)."@en ; + ; + "Asset Distribution"@en . + + + a ; + "A system or service that provides facilities for storage and maintenance of descriptions of Assets and Asset Distributions, and functionality that allows users to search and access these descriptions. An Asset Repository will typically contain descriptions of several Assets and related Asset Distributions."@en ; + ; + "Asset repository"@en . + + + a ; + "This is based on the UN/CEFACT Identifier class."@en ; + ; + "Identifier"@en . + + + a ; + "Links a resource to an adms:Identifier class."@en ; + ; + ; + "identifier"@en ; + . + + + a ; + "An Asset that is contained in the Asset being described, e.g. when there are several vocabularies defined in a single document."@en ; + ; + ; + "included asset"@en ; + . + + + a ; + "The interoperability level for which the Asset is relevant."@en ; + ; + ; + "interoperability level"@en ; + . + + + a ; + "A link to the current or latest version of the Asset."@en ; + ; + ; + "last"@en ; + ; + . + + + a ; + "A link to the next version of the Asset."@en ; + ; + ; + "next"@en ; + ; + . + + + a ; + "A link to the previous version of the Asset."@en ; + ; + ; + "prev"@en ; + ; + . + + + a ; + "More information about the format in which an Asset Distribution is released. This is different from the file format as, for example, a ZIP file (file format) could contain an XML schema (representation technique)."@en ; + ; + ; + "representation technique"@en ; + . + + + a ; + "Links to a sample of an Asset (which is itself an Asset)."@en ; + ; + ; + "sample"@en ; + . + + + a ; + "The name of the agency that issued the identifier."@en ; + ; + ; + "schema agency"@en ; + . + + + a ; + "The status of the Asset in the context of a particular workflow process."@en ; + ; + ; + "status"@en ; + . + + + a ; + "A schema according to which the Asset Repository can provide data about its content, e.g. ADMS."@en ; + ; + ; + "supported schema"@en ; + . + + + a ; + "Links Assets that are translations of each other."@en ; + ; + ; + "translation"@en ; + . + + + a ; + "A description of changes between this version and the previous version of the Asset."@en ; + ; + ; + "version notes"@en ; + . + diff --git a/test/data/defined_namespaces/rdfs.rdf b/test/data/defined_namespaces/rdfs.rdf new file mode 100644 index 00000000..bf17bab0 --- /dev/null +++ b/test/data/defined_namespaces/rdfs.rdf @@ -0,0 +1,130 @@ + + + + + + + Resource + The class resource, everything. + + + + + Class + The class of classes. + + + + + + subClassOf + The subject is a subclass of a class. + + + + + + + subPropertyOf + The subject is a subproperty of a property. + + + + + + + comment + A description of the subject resource. + + + + + + + label + A human-readable name for the subject. + + + + + + + domain + A domain of the subject property. + + + + + + + range + A range of the subject property. + + + + + + + seeAlso + Further information about the subject resource. + + + + + + + + isDefinedBy + The defininition of the subject resource. + + + + + + + Literal + The class of literal values, eg. textual strings and integers. + + + + + + Container + + The class of RDF containers. + + + + + ContainerMembershipProperty + The class of container membership properties, rdf:_1, rdf:_2, ..., + all of which are sub-properties of 'member'. + + + + + + member + A member of the subject resource. + + + + + + + Datatype + The class of RDF datatypes. + + + + + + + + diff --git a/test/data/fetcher.py b/test/data/fetcher.py index 7c9e4ff0..1ea8e337 100755 --- a/test/data/fetcher.py +++ b/test/data/fetcher.py @@ -248,6 +248,21 @@ RESOURCES: List[Resource] = [ ), local_path=(DATA_PATH / "defined_namespaces/rdfs.ttl"), ), + FileResource( + remote=Request( + "http://www.w3.org/2000/01/rdf-schema#", + headers={"Accept": "application/rdf+xml"}, + ), + local_path=(DATA_PATH / "defined_namespaces/rdfs.rdf"), + ), + FileResource( + remote=Request("http://www.w3.org/ns/adms.rdf"), + local_path=(DATA_PATH / "defined_namespaces/adms.rdf"), + ), + FileResource( + remote=Request("http://www.w3.org/ns/adms.ttl"), + local_path=(DATA_PATH / "defined_namespaces/adms.ttl"), + ), FileResource( remote=Request("https://www.w3.org/ns/rdftest.ttl"), local_path=(DATA_PATH / "defined_namespaces/rdftest.ttl"), diff --git a/test/test_graph/test_graph.py b/test/test_graph/test_graph.py index b133c2b5..289d577a 100644 --- a/test/test_graph/test_graph.py +++ b/test/test_graph/test_graph.py @@ -1,11 +1,13 @@ # -*- coding: utf-8 -*- import logging import os +from contextlib import ExitStack from pathlib import Path from test.data import TEST_DATA_DIR, bob, cheese, hates, likes, michel, pizza, tarek from test.utils import GraphHelper, get_unique_plugin_names +from test.utils.exceptions import ExceptionChecker from test.utils.httpfileserver import HTTPFileServer, ProtoFileResource -from typing import Callable, Optional, Set +from typing import Callable, Optional, Set, Tuple, Union from urllib.error import HTTPError, URLError import pytest @@ -342,14 +344,6 @@ def test_guess_format_for_parse( # only getting HTML with pytest.raises(PluginException): graph.parse(location=file_info.request_url) - - try: - graph.parse(location="http://www.w3.org/ns/adms.ttl") - graph.parse(location="http://www.w3.org/ns/adms.rdf") - except (URLError, HTTPError): - # this endpoint is currently not available, ignore this test. - pass - try: # persistent Australian Government online RDF resource without a file-like ending graph.parse(location="https://linked.data.gov.au/def/agrif?_format=text/turtle") @@ -358,6 +352,55 @@ def test_guess_format_for_parse( pass +@pytest.mark.parametrize( + ("file", "content_type", "expected_result"), + ( + (TEST_DATA_DIR / "defined_namespaces/adms.rdf", "application/rdf+xml", 132), + (TEST_DATA_DIR / "defined_namespaces/adms.ttl", "text/turtle", 132), + (TEST_DATA_DIR / "defined_namespaces/adms.ttl", None, 132), + ( + TEST_DATA_DIR / "defined_namespaces/adms.rdf", + None, + ExceptionChecker( + ParserError, + r"Could not guess RDF format .* from file extension so tried Turtle", + ), + ), + ), +) +def test_guess_format_for_parse_http( + make_graph: GraphFactory, + http_file_server: HTTPFileServer, + file: Path, + content_type: Optional[str], + expected_result: Union[int, ExceptionChecker], +) -> None: + graph = make_graph() + headers: Tuple[Tuple[str, str], ...] = tuple() + if content_type is not None: + headers = (("Content-Type", content_type),) + + file_info = http_file_server.add_file_with_caching( + ProtoFileResource(headers, file), + suffix=f"/{file.name}", + ) + catcher: Optional[pytest.ExceptionInfo[Exception]] = None + + assert 0 == len(graph) + with ExitStack() as exit_stack: + if isinstance(expected_result, ExceptionChecker): + catcher = exit_stack.enter_context(pytest.raises(expected_result.type)) + graph.parse(location=file_info.request_url) + + if catcher is not None: + # assert catcher.value is not None + assert isinstance(expected_result, ExceptionChecker) + logging.debug("graph = %s", list(graph.triples((None, None, None)))) + else: + assert isinstance(expected_result, int) + assert expected_result == len(graph) + + def test_parse_file_uri(make_graph: GraphFactory): EG = Namespace("http://example.org/#") g = make_graph() diff --git a/test/test_graph/test_graph_redirect.py b/test/test_graph/test_graph_redirect.py new file mode 100644 index 00000000..c61adbc5 --- /dev/null +++ b/test/test_graph/test_graph_redirect.py @@ -0,0 +1,45 @@ +from test.data import TEST_DATA_DIR, simple_triple_graph +from test.utils import GraphHelper +from test.utils.http import MethodName, MockHTTPResponse +from test.utils.httpservermock import ServedBaseHTTPServerMock +from typing import Tuple +from urllib.parse import urlparse + +from rdflib.graph import Graph + + +def test_graph_redirect_new_host( + function_httpmocks: Tuple[ServedBaseHTTPServerMock, ServedBaseHTTPServerMock] +) -> None: + """ + Redirect to new host results in a request with the right Host header + parameter. + """ + + mock_a, mock_b = function_httpmocks + + mock_a.responses[MethodName.GET].append( + MockHTTPResponse( + 308, + "Permanent Redirect", + b"", + {"Location": [f"{mock_b.url}/b/data.ttl"]}, + ) + ) + + mock_b.responses[MethodName.GET].append( + MockHTTPResponse( + 200, + "OK", + (TEST_DATA_DIR / "variants" / "simple_triple.ttl").read_bytes(), + {"Content-Type": ["text/turtle"]}, + ) + ) + + graph = Graph() + graph.parse(location=f"{mock_a.url}/a/data.ttl") + GraphHelper.assert_sets_equals(graph, simple_triple_graph) + for mock in function_httpmocks: + assert 1 == len(mock.requests[MethodName.GET]) + for request in mock.requests[MethodName.GET]: + assert request.headers["Host"] == urlparse(mock.url).netloc diff --git a/test/test_misc/test_input_source.py b/test/test_misc/test_input_source.py index f3da062b..90e6e238 100644 --- a/test/test_misc/test_input_source.py +++ b/test/test_misc/test_input_source.py @@ -11,6 +11,7 @@ from dataclasses import dataclass # from itertools import product from pathlib import Path from test.utils import GraphHelper +from test.utils.exceptions import ExceptionChecker from test.utils.httpfileserver import ( HTTPFileInfo, HTTPFileServer, @@ -27,7 +28,6 @@ from typing import ( # Callable, Generic, Iterable, Optional, - Pattern, TextIO, Tuple, Type, @@ -251,21 +251,6 @@ def call_create_input_source( yield input_source -@dataclass -class ExceptionChecker: - type: Type[Exception] - pattern: Optional[Pattern[str]] = None - - def check(self, exception: Exception) -> None: - try: - assert isinstance(exception, self.type) - if self.pattern is not None: - assert self.pattern.match(f"{exception}") - except Exception: - logging.error("problem checking exception", exc_info=exception) - raise - - AnyT = TypeVar("AnyT") diff --git a/test/test_misc/test_networking_redirect.py b/test/test_misc/test_networking_redirect.py new file mode 100644 index 00000000..acde10d7 --- /dev/null +++ b/test/test_misc/test_networking_redirect.py @@ -0,0 +1,217 @@ +from contextlib import ExitStack +from copy import deepcopy +from test.utils.exceptions import ExceptionChecker +from test.utils.http import headers_as_message as headers_as_message +from typing import Any, Dict, Iterable, Optional, Type, TypeVar, Union +from urllib.error import HTTPError +from urllib.request import HTTPRedirectHandler, Request + +import pytest +from _pytest.mark.structures import ParameterSet + +from rdflib._networking import _make_redirect_request + +AnyT = TypeVar("AnyT") + + +def with_attrs(object: AnyT, **kwargs: Any) -> AnyT: + for key, value in kwargs.items(): + setattr(object, key, value) + return object + + +class RaisesIdentity: + pass + + +def generate_make_redirect_request_cases() -> Iterable[ParameterSet]: + yield pytest.param( + Request("http://example.com/data.ttl"), + HTTPError( + "", + 308, + "Permanent Redirect", + headers_as_message({}), + None, + ), + RaisesIdentity, + {}, + id="Exception passes through if no Location header is present", + ) + yield pytest.param( + Request("http://example.com/data.ttl"), + HTTPError( + "", + 308, + "Permanent Redirect", + headers_as_message({"Location": [100]}), # type: ignore[arg-type] + None, + ), + ExceptionChecker(ValueError, "Location header 100 is not a string"), + {}, + id="Location must be a string", + ) + yield pytest.param( + Request("http://example.com/data.ttl"), + HTTPError( + "", + 308, + "Permanent Redirect", + headers_as_message({"Location": ["example:data.ttl"]}), + None, + ), + ExceptionChecker( + HTTPError, + "HTTP Error 308: Permanent Redirect - Redirection to url 'example:data.ttl' is not allowed", + {"code": 308}, + ), + {}, + id="Error passes through with a slight alterations if the Location header is not a supported URL", + ) + + url_prefix = "http://example.com" + for request_url_suffix, redirect_location, new_url_suffix in [ + ("/data.ttl", "", "/data.ttl"), + ("", "", ""), + ("/data.ttl", "a", "/a"), + ("", "a", "/a"), + ("/a/b/c/", ".", "/a/b/c/"), + ("/a/b/c", ".", "/a/b/"), + ("/a/b/c/", "..", "/a/b/"), + ("/a/b/c", "..", "/a/"), + ("/a/b/c/", "/", "/"), + ("/a/b/c/", "/x/", "/x/"), + ("/a/b/c/", "/x/y", "/x/y"), + ("/a/b/c/", f"{url_prefix}", ""), + ("/a/b/c/", f"{url_prefix}/", "/"), + ("/a/b/c/", f"{url_prefix}/a/../b", "/a/../b"), + ("/", f"{url_prefix}/ /data.ttl", "/%20%20%20/data.ttl"), + ]: + request_url = f"http://example.com{request_url_suffix}" + new_url = f"http://example.com{new_url_suffix}" + yield pytest.param( + Request(request_url), + HTTPError( + "", + 308, + "Permanent Redirect", + headers_as_message({"Location": [redirect_location]}), + None, + ), + Request(new_url, unverifiable=True), + {new_url: 1}, + id=f"Redirect from {request_url!r} to {redirect_location!r} is correctly handled", + ) + + yield pytest.param( + Request( + "http://example.com/data.ttl", + b"foo", + headers={ + "Content-Type": "text/plain", + "Content-Length": "3", + "Accept": "text/turtle", + }, + ), + HTTPError( + "", + 308, + "Permanent Redirect", + headers_as_message({"Location": ["http://example.org/data.ttl"]}), + None, + ), + Request( + "http://example.org/data.ttl", + headers={"Accept": "text/turtle"}, + origin_req_host="example.com", + unverifiable=True, + ), + {"http://example.org/data.ttl": 1}, + id="Headers transfer correctly", + ) + + yield pytest.param( + with_attrs( + Request( + "http://example.com/data1.ttl", + ), + redirect_dict=dict( + (f"http://example.com/redirect/{index}", 1) + for index in range(HTTPRedirectHandler.max_redirections) + ), + ), + HTTPError( + "", + 308, + "Permanent Redirect", + headers_as_message({"Location": ["http://example.org/data2.ttl"]}), + None, + ), + ExceptionChecker( + HTTPError, + f"HTTP Error 308: {HTTPRedirectHandler.inf_msg}Permanent Redirect", + ), + {}, + id="Max redirects is respected", + ) + + yield pytest.param( + with_attrs( + Request( + "http://example.com/data1.ttl", + ), + redirect_dict={ + "http://example.org/data2.ttl": HTTPRedirectHandler.max_repeats + }, + ), + HTTPError( + "", + 308, + "Permanent Redirect", + headers_as_message({"Location": ["http://example.org/data2.ttl"]}), + None, + ), + ExceptionChecker( + HTTPError, + f"HTTP Error 308: {HTTPRedirectHandler.inf_msg}Permanent Redirect", + ), + {}, + id="Max repeats is respected", + ) + + +@pytest.mark.parametrize( + ("http_request", "http_error", "expected_result", "expected_redirect_dict"), + generate_make_redirect_request_cases(), +) +def test_make_redirect_request( + http_request: Request, + http_error: HTTPError, + expected_result: Union[Type[RaisesIdentity], ExceptionChecker, Request], + expected_redirect_dict: Dict[str, int], +) -> None: + """ + `_make_redirect_request` correctly handles redirects. + """ + catcher: Optional[pytest.ExceptionInfo[Exception]] = None + result: Optional[Request] = None + with ExitStack() as stack: + if isinstance(expected_result, ExceptionChecker): + catcher = stack.enter_context(pytest.raises(expected_result.type)) + elif expected_result is RaisesIdentity: + catcher = stack.enter_context(pytest.raises(HTTPError)) + result = _make_redirect_request(http_request, http_error) + + if isinstance(expected_result, ExceptionChecker): + assert catcher is not None + expected_result.check(catcher.value) + elif isinstance(expected_result, type): + assert catcher is not None + assert http_error is catcher.value + else: + assert expected_redirect_dict == getattr(result, "redirect_dict", None) + assert expected_redirect_dict == getattr(http_request, "redirect_dict", None) + check = deepcopy(expected_result) + check.unverifiable = True + check = with_attrs(check, redirect_dict=expected_redirect_dict) + assert vars(check) == vars(result) diff --git a/test/utils/exceptions.py b/test/utils/exceptions.py new file mode 100644 index 00000000..a814f9b4 --- /dev/null +++ b/test/utils/exceptions.py @@ -0,0 +1,29 @@ +import logging +import re +from dataclasses import dataclass +from typing import Any, Dict, Optional, Pattern, Type, Union + + +@dataclass(frozen=True) +class ExceptionChecker: + type: Type[Exception] + pattern: Optional[Union[Pattern[str], str]] = None + attributes: Optional[Dict[str, Any]] = None + + def check(self, exception: Exception) -> None: + logging.debug("checking exception %s/%r", type(exception), exception) + pattern = self.pattern + if pattern is not None and not isinstance(pattern, re.Pattern): + pattern = re.compile(pattern) + try: + assert isinstance(exception, self.type) + if pattern is not None: + assert pattern.match(f"{exception}") + if self.attributes is not None: + for key, value in self.attributes.items(): + logging.debug("checking exception attribute %s=%r", key, value) + assert hasattr(exception, key) + assert getattr(exception, key) == value + except Exception: + logging.error("problem checking exception", exc_info=exception) + raise diff --git a/test/utils/http.py b/test/utils/http.py index fa13a2ed..e40d2a8c 100644 --- a/test/utils/http.py +++ b/test/utils/http.py @@ -108,3 +108,12 @@ def ctx_http_server(server: HTTPServerT) -> Iterator[HTTPServerT]: server.shutdown() server.socket.close() server_thread.join() + + +def headers_as_message(headers: HeadersT) -> email.message.Message: + message = email.message.Message() + for header, value in header_items(headers): + # This will append the value to any existing values for the header + # instead of replacing it. + message[header] = value + return message diff --git a/test/utils/httpfileserver.py b/test/utils/httpfileserver.py index 1989070a..49c92e80 100644 --- a/test/utils/httpfileserver.py +++ b/test/utils/httpfileserver.py @@ -74,7 +74,7 @@ class HTTPFileInfo: :param effective_url: The URL that the file will be served from after redirects. :param redirects: A sequence of redirects that will be given to the client - if it uses the ``request_url``. This sequence will terimate in the + if it uses the ``request_url``. This sequence will terminate in the ``effective_url``. """ @@ -128,15 +128,17 @@ class HTTPFileServer(HTTPServer): self, proto_file: ProtoFileResource, proto_redirects: Optional[Sequence[ProtoRedirectResource]] = None, + suffix: str = "", ) -> HTTPFileInfo: - return self.add_file(proto_file, proto_redirects) + return self.add_file(proto_file, proto_redirects, suffix) def add_file( self, proto_file: ProtoFileResource, proto_redirects: Optional[Sequence[ProtoRedirectResource]] = None, + suffix: str = "", ) -> HTTPFileInfo: - url_path = f"/file/{uuid4().hex}" + url_path = f"/file/{uuid4().hex}{suffix}" url = urljoin(self.url, url_path) file_resource = FileResource( url_path=url_path, @@ -151,7 +153,7 @@ class HTTPFileServer(HTTPServer): redirects: List[RedirectResource] = [] for proto_redirect in reversed(proto_redirects): - redirect_url_path = f"/redirect/{uuid4().hex}" + redirect_url_path = f"/redirect/{uuid4().hex}{suffix}" if proto_redirect.location_type == LocationType.URL: location = url elif proto_redirect.location_type == LocationType.ABSOLUTE_PATH: -- cgit v1.2.1