diff options
-rw-r--r-- | rdflib/util.py | 90 | ||||
-rw-r--r-- | test/test_graph/test_graph_http.py | 43 | ||||
-rw-r--r-- | test/test_util.py | 18 | ||||
-rw-r--r-- | test/utils/http.py | 9 | ||||
-rw-r--r-- | test/utils/wildcard.py | 28 |
5 files changed, 169 insertions, 19 deletions
diff --git a/rdflib/util.py b/rdflib/util.py index 4485de2e..2442b372 100644 --- a/rdflib/util.py +++ b/rdflib/util.py @@ -522,32 +522,92 @@ def _coalesce( return default +_RFC3986_SUBDELIMS = "!$&'()*+,;=" +""" +``sub-delims`` production from `RFC 3986, section 2.2 +<https://www.rfc-editor.org/rfc/rfc3986.html#section-2.2>`_. +""" + +_RFC3986_PCHAR_NU = "%" + _RFC3986_SUBDELIMS + ":@" +""" +The non-unreserved characters in the ``pchar`` production from RFC 3986. +""" + +_QUERY_SAFE_CHARS = _RFC3986_PCHAR_NU + "/?" +""" +The non-unreserved characters that are safe to use in the query and fragment +components. + +.. code-block:: + + pchar = unreserved / pct-encoded / sub-delims / ":" / "@" + query = *( pchar / "/" / "?" ) + fragment = *( pchar / "/" / "?" ) +""" + +_USERNAME_SAFE_CHARS = _RFC3986_SUBDELIMS + "%" +""" +The non-unreserved characters that are safe to use in the username and password +components. + +.. code-block:: + + userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) + +":" is excluded as this is only used for the username and password components, +and they are treated separately. +""" + +_PATH_SAFE_CHARS = _RFC3986_PCHAR_NU + "/" +""" +The non-unreserved characters that are safe to use in the path component. + + +This is based on various path-related productions from RFC 3986. +""" + + def _iri2uri(iri: str) -> str: """ - Convert an IRI to a URI (Python 3). - https://stackoverflow.com/a/42309027 - https://stackoverflow.com/a/40654295 - netloc should be encoded using IDNA; - non-ascii URL path should be encoded to UTF-8 and then percent-escaped; - non-ascii query parameters should be encoded to the encoding of a page - URL was extracted from (or to the encoding server uses), then - percent-escaped. 
+ Prior art: + + * `iri_to_uri from Werkzeug <https://github.com/pallets/werkzeug/blob/92c6380248c7272ee668e1f8bbd80447027ccce2/src/werkzeug/urls.py#L926-L931>`_ + >>> _iri2uri("https://dbpedia.org/resource/Almería") 'https://dbpedia.org/resource/Almer%C3%ADa' """ + # https://datatracker.ietf.org/doc/html/rfc3986 # https://datatracker.ietf.org/doc/html/rfc3305 - (scheme, netloc, path, query, fragment) = urlsplit(iri) + parts = urlsplit(iri) + (scheme, netloc, path, query, fragment) = parts - # Just support http/https, otherwise return the iri unmolested + # Just support http/https, otherwise return the iri unaltered if scheme not in ["http", "https"]: return iri - scheme = quote(scheme) - netloc = netloc.encode("idna").decode("utf-8") - path = quote(path) - query = quote(query) - fragment = quote(fragment) + path = quote(path, safe=_PATH_SAFE_CHARS) + query = quote(query, safe=_QUERY_SAFE_CHARS) + fragment = quote(fragment, safe=_QUERY_SAFE_CHARS) + + if parts.hostname: + netloc = parts.hostname.encode("idna").decode("ascii") + else: + netloc = "" + + if ":" in netloc: + # Quote IPv6 addresses + netloc = f"[{netloc}]" + + if parts.port: + netloc = f"{netloc}:{parts.port}" + + if parts.username: + auth = quote(parts.username, safe=_USERNAME_SAFE_CHARS) + if parts.password: + pass_quoted = quote(parts.password, safe=_USERNAME_SAFE_CHARS) + auth = f"{auth}:{pass_quoted}" + netloc = f"{auth}@{netloc}" + uri = urlunsplit((scheme, netloc, path, query, fragment)) if iri.endswith("#") and not uri.endswith("#"): diff --git a/test/test_graph/test_graph_http.py b/test/test_graph/test_graph_http.py index 762e3d5b..97c64c3a 100644 --- a/test/test_graph/test_graph_http.py +++ b/test/test_graph/test_graph_http.py @@ -1,14 +1,20 @@ +import logging import re from http.server import BaseHTTPRequestHandler from test.data import TEST_DATA_DIR from test.utils import GraphHelper from test.utils.graph import cached_graph -from test.utils.http import ctx_http_handler +from test.utils.http 
import ( + MOCK_HTTP_REQUEST_WILDCARD, + MockHTTPRequest, + ctx_http_handler, +) from test.utils.httpservermock import ( MethodName, MockHTTPResponse, ServedBaseHTTPServerMock, ) +from test.utils.wildcard import URL_PARSE_RESULT_WILDCARD from urllib.error import HTTPError import pytest @@ -235,7 +241,34 @@ class TestGraphHTTP: assert raised.value.code == 500 -def test_iri_source(function_httpmock: ServedBaseHTTPServerMock) -> None: +@pytest.mark.parametrize( + ["url_suffix", "expected_request"], + [ + ( + "/resource/Almería", + MOCK_HTTP_REQUEST_WILDCARD._replace( + path="/resource/Almer%C3%ADa", + parsed_path=URL_PARSE_RESULT_WILDCARD._replace( + path="/resource/Almer%C3%ADa" + ), + ), + ), + ( + "/resource/Almería?foo=bar", + MOCK_HTTP_REQUEST_WILDCARD._replace( + parsed_path=URL_PARSE_RESULT_WILDCARD._replace( + path="/resource/Almer%C3%ADa" + ), + path_query={"foo": ["bar"]}, + ), + ), + ], +) +def test_iri_source( + url_suffix: str, + expected_request: MockHTTPRequest, + function_httpmock: ServedBaseHTTPServerMock, +) -> None: diverse_triples_path = TEST_DATA_DIR / "variants/diverse_triples.ttl" function_httpmock.responses[MethodName.GET].append( @@ -247,9 +280,11 @@ def test_iri_source(function_httpmock: ServedBaseHTTPServerMock) -> None: ) ) g = Graph() - g.parse(f"{function_httpmock.url}/resource/Almería") + g.parse(f"{function_httpmock.url}{url_suffix}") assert function_httpmock.call_count == 1 GraphHelper.assert_triple_sets_equals(cached_graph((diverse_triples_path,)), g) + assert len(g) > 1 req = function_httpmock.requests[MethodName.GET].pop(0) - assert req.path == "/resource/Almer%C3%ADa" + logging.debug("req = %s", req) + assert expected_request == req diff --git a/test/test_util.py b/test/test_util.py index 3e60bbb8..c842bc92 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -635,6 +635,24 @@ def test_get_tree( "http://example.com:1231/", }, ), + ( + "http://example.com:1231/a=b", + { + "http://example.com:1231/a=b", + }, + ), + ( + 
"http://aé:aé@example.com:1231/bé/a=bé&c=d#a=bé&c=d", + { + "http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d", + }, + ), + ( + "http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d", + { + "http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d", + }, + ), ], ) def test_iri2uri(iri: str, expected_result: Union[Set[str], Type[Exception]]) -> None: diff --git a/test/utils/http.py b/test/utils/http.py index af72e015..fa13a2ed 100644 --- a/test/utils/http.py +++ b/test/utils/http.py @@ -4,6 +4,7 @@ import enum import random from contextlib import contextmanager from http.server import BaseHTTPRequestHandler, HTTPServer +from test.utils.wildcard import EQ_WILDCARD from threading import Thread from typing import ( Dict, @@ -62,6 +63,14 @@ class MockHTTPRequest(NamedTuple): body: Optional[bytes] +MOCK_HTTP_REQUEST_WILDCARD = MockHTTPRequest( + EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD +) +""" +This object should be equal to any `MockHTTPRequest` object. +""" + + class MockHTTPResponse(NamedTuple): status_code: int reason_phrase: str diff --git a/test/utils/wildcard.py b/test/utils/wildcard.py new file mode 100644 index 00000000..7444a24b --- /dev/null +++ b/test/utils/wildcard.py @@ -0,0 +1,28 @@ +from typing import Any +from urllib.parse import ParseResult + + +class EqWildcard: + """ + An object that matches anything. + """ + + def __eq__(self, other: Any) -> Any: + return True + + def __req__(self, other: Any) -> Any: + return True + + def __repr__(self) -> str: + return "EqWildcard()" + + +EQ_WILDCARD: Any = EqWildcard() + + +URL_PARSE_RESULT_WILDCARD = ParseResult( + EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD +) +""" +This should be equal to any `ParseResult` object. +""" |