diff options
author | Iwan Aucamp <aucampia@gmail.com> | 2023-03-23 18:48:14 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-03-23 18:48:14 +0100 |
commit | dfa40545adc3e701bc36d2f8cc4dba1d81a906c4 (patch) | |
tree | c743669c27caa86eb760b63ae85e83f8e472b4c9 | |
parent | cfe6e378e6b0aff106f6baf3b5d82adbeb547236 (diff) | |
download | rdflib-dfa40545adc3e701bc36d2f8cc4dba1d81a906c4.tar.gz |
fix: IRI to URI conversion (#2304)
The IRI to URI conversion was percent-encoding (quoting) characters that should
not have been quoted, like the equals sign in the query string. It was also not
quoting things that should have been quoted, like the username and password
components of a URI.
This change improves the conversion by only quoting characters that are not
allowed in specific parts of the URI and quoting previously unquoted components.
The safe characters for each segment are taken from
[RFC3986](https://datatracker.ietf.org/doc/html/rfc3986).
The new behavior is heavily inspired by
[`werkzeug.urls.iri_to_uri`](https://github.com/pallets/werkzeug/blob/92c6380248c7272ee668e1f8bbd80447027ccce2/src/werkzeug/urls.py#L926-L931)
though there are some differences.
- Closes <https://github.com/RDFLib/rdflib/issues/2120>.
-rw-r--r-- | rdflib/util.py | 90 | ||||
-rw-r--r-- | test/test_graph/test_graph_http.py | 43 | ||||
-rw-r--r-- | test/test_util.py | 18 | ||||
-rw-r--r-- | test/utils/http.py | 9 | ||||
-rw-r--r-- | test/utils/wildcard.py | 28 |
5 files changed, 169 insertions, 19 deletions
diff --git a/rdflib/util.py b/rdflib/util.py index 4485de2e..2442b372 100644 --- a/rdflib/util.py +++ b/rdflib/util.py @@ -522,32 +522,92 @@ def _coalesce( return default +_RFC3986_SUBDELIMS = "!$&'()*+,;=" +""" +``sub-delims`` production from `RFC 3986, section 2.2 +<https://www.rfc-editor.org/rfc/rfc3986.html#section-2.2>`_. +""" + +_RFC3986_PCHAR_NU = "%" + _RFC3986_SUBDELIMS + ":@" +""" +The non-unreserved characters in the ``pchar`` production from RFC 3986. +""" + +_QUERY_SAFE_CHARS = _RFC3986_PCHAR_NU + "/?" +""" +The non-unreserved characters that are safe to use in in the query and fragment +components. + +.. code-block:: + + pchar = unreserved / pct-encoded / sub-delims / ":" / "@" query + = *( pchar / "/" / "?" ) fragment = *( pchar / "/" / "?" ) +""" + +_USERNAME_SAFE_CHARS = _RFC3986_SUBDELIMS + "%" +""" +The non-unreserved characters that are safe to use in the username and password +components. + +.. code-block:: + + userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) + +":" is excluded as this is only used for the username and password components, +and they are treated separately. +""" + +_PATH_SAFE_CHARS = _RFC3986_PCHAR_NU + "/" +""" +The non-unreserved characters that are safe to use in the path component. + + +This is based on various path-related productions from RFC 3986. +""" + + def _iri2uri(iri: str) -> str: """ - Convert an IRI to a URI (Python 3). - https://stackoverflow.com/a/42309027 - https://stackoverflow.com/a/40654295 - netloc should be encoded using IDNA; - non-ascii URL path should be encoded to UTF-8 and then percent-escaped; - non-ascii query parameters should be encoded to the encoding of a page - URL was extracted from (or to the encoding server uses), then - percent-escaped. 
+ Prior art: + + * `iri_to_uri from Werkzeug <https://github.com/pallets/werkzeug/blob/92c6380248c7272ee668e1f8bbd80447027ccce2/src/werkzeug/urls.py#L926-L931>`_ + >>> _iri2uri("https://dbpedia.org/resource/Almería") 'https://dbpedia.org/resource/Almer%C3%ADa' """ + # https://datatracker.ietf.org/doc/html/rfc3986 # https://datatracker.ietf.org/doc/html/rfc3305 - (scheme, netloc, path, query, fragment) = urlsplit(iri) + parts = urlsplit(iri) + (scheme, netloc, path, query, fragment) = parts - # Just support http/https, otherwise return the iri unmolested + # Just support http/https, otherwise return the iri unaltered if scheme not in ["http", "https"]: return iri - scheme = quote(scheme) - netloc = netloc.encode("idna").decode("utf-8") - path = quote(path) - query = quote(query) - fragment = quote(fragment) + path = quote(path, safe=_PATH_SAFE_CHARS) + query = quote(query, safe=_QUERY_SAFE_CHARS) + fragment = quote(fragment, safe=_QUERY_SAFE_CHARS) + + if parts.hostname: + netloc = parts.hostname.encode("idna").decode("ascii") + else: + netloc = "" + + if ":" in netloc: + # Quote IPv6 addresses + netloc = f"[{netloc}]" + + if parts.port: + netloc = f"{netloc}:{parts.port}" + + if parts.username: + auth = quote(parts.username, safe=_USERNAME_SAFE_CHARS) + if parts.password: + pass_quoted = quote(parts.password, safe=_USERNAME_SAFE_CHARS) + auth = f"{auth}:{pass_quoted}" + netloc = f"{auth}@{netloc}" + uri = urlunsplit((scheme, netloc, path, query, fragment)) if iri.endswith("#") and not uri.endswith("#"): diff --git a/test/test_graph/test_graph_http.py b/test/test_graph/test_graph_http.py index 762e3d5b..97c64c3a 100644 --- a/test/test_graph/test_graph_http.py +++ b/test/test_graph/test_graph_http.py @@ -1,14 +1,20 @@ +import logging import re from http.server import BaseHTTPRequestHandler from test.data import TEST_DATA_DIR from test.utils import GraphHelper from test.utils.graph import cached_graph -from test.utils.http import ctx_http_handler +from test.utils.http 
import ( + MOCK_HTTP_REQUEST_WILDCARD, + MockHTTPRequest, + ctx_http_handler, +) from test.utils.httpservermock import ( MethodName, MockHTTPResponse, ServedBaseHTTPServerMock, ) +from test.utils.wildcard import URL_PARSE_RESULT_WILDCARD from urllib.error import HTTPError import pytest @@ -235,7 +241,34 @@ class TestGraphHTTP: assert raised.value.code == 500 -def test_iri_source(function_httpmock: ServedBaseHTTPServerMock) -> None: +@pytest.mark.parametrize( + ["url_suffix", "expected_request"], + [ + ( + "/resource/Almería", + MOCK_HTTP_REQUEST_WILDCARD._replace( + path="/resource/Almer%C3%ADa", + parsed_path=URL_PARSE_RESULT_WILDCARD._replace( + path="/resource/Almer%C3%ADa" + ), + ), + ), + ( + "/resource/Almería?foo=bar", + MOCK_HTTP_REQUEST_WILDCARD._replace( + parsed_path=URL_PARSE_RESULT_WILDCARD._replace( + path="/resource/Almer%C3%ADa" + ), + path_query={"foo": ["bar"]}, + ), + ), + ], +) +def test_iri_source( + url_suffix: str, + expected_request: MockHTTPRequest, + function_httpmock: ServedBaseHTTPServerMock, +) -> None: diverse_triples_path = TEST_DATA_DIR / "variants/diverse_triples.ttl" function_httpmock.responses[MethodName.GET].append( @@ -247,9 +280,11 @@ def test_iri_source(function_httpmock: ServedBaseHTTPServerMock) -> None: ) ) g = Graph() - g.parse(f"{function_httpmock.url}/resource/Almería") + g.parse(f"{function_httpmock.url}{url_suffix}") assert function_httpmock.call_count == 1 GraphHelper.assert_triple_sets_equals(cached_graph((diverse_triples_path,)), g) + assert len(g) > 1 req = function_httpmock.requests[MethodName.GET].pop(0) - assert req.path == "/resource/Almer%C3%ADa" + logging.debug("req = %s", req) + assert expected_request == req diff --git a/test/test_util.py b/test/test_util.py index 3e60bbb8..c842bc92 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -635,6 +635,24 @@ def test_get_tree( "http://example.com:1231/", }, ), + ( + "http://example.com:1231/a=b", + { + "http://example.com:1231/a=b", + }, + ), + ( + 
"http://aé:aé@example.com:1231/bé/a=bé&c=d#a=bé&c=d", + { + "http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d", + }, + ), + ( + "http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d", + { + "http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d", + }, + ), ], ) def test_iri2uri(iri: str, expected_result: Union[Set[str], Type[Exception]]) -> None: diff --git a/test/utils/http.py b/test/utils/http.py index af72e015..fa13a2ed 100644 --- a/test/utils/http.py +++ b/test/utils/http.py @@ -4,6 +4,7 @@ import enum import random from contextlib import contextmanager from http.server import BaseHTTPRequestHandler, HTTPServer +from test.utils.wildcard import EQ_WILDCARD from threading import Thread from typing import ( Dict, @@ -62,6 +63,14 @@ class MockHTTPRequest(NamedTuple): body: Optional[bytes] +MOCK_HTTP_REQUEST_WILDCARD = MockHTTPRequest( + EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD +) +""" +This object should be equal to any `MockHTTPRequest` object. +""" + + class MockHTTPResponse(NamedTuple): status_code: int reason_phrase: str diff --git a/test/utils/wildcard.py b/test/utils/wildcard.py new file mode 100644 index 00000000..7444a24b --- /dev/null +++ b/test/utils/wildcard.py @@ -0,0 +1,28 @@ +from typing import Any +from urllib.parse import ParseResult + + +class EqWildcard: + """ + An object that matches anything. + """ + + def __eq__(self, other: Any) -> Any: + return True + + def __req__(self, other: Any) -> Any: + return True + + def __repr__(self) -> str: + return "EqWildcard()" + + +EQ_WILDCARD: Any = EqWildcard() + + +URL_PARSE_RESULT_WILDCARD = ParseResult( + EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD +) +""" +This should be equal to any `ParseResult` object. +""" |