summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIwan Aucamp <aucampia@gmail.com>2023-03-23 18:48:14 +0100
committerGitHub <noreply@github.com>2023-03-23 18:48:14 +0100
commitdfa40545adc3e701bc36d2f8cc4dba1d81a906c4 (patch)
treec743669c27caa86eb760b63ae85e83f8e472b4c9
parentcfe6e378e6b0aff106f6baf3b5d82adbeb547236 (diff)
downloadrdflib-dfa40545adc3e701bc36d2f8cc4dba1d81a906c4.tar.gz
fix: IRI to URI conversion (#2304)
The IRI to URI conversion was percent-quoting characters that should not have been quoted, like equals in the query string. It was also not quoting things that should have been quoted, like the username and password components of a URI. This change improves the conversion by only quoting characters that are not allowed in specific parts of the URI and quoting previously unquoted components. The safe characters for each segment are taken from [RFC3986](https://datatracker.ietf.org/doc/html/rfc3986). The new behavior is heavily inspired by [`werkzeug.urls.iri_to_uri`](https://github.com/pallets/werkzeug/blob/92c6380248c7272ee668e1f8bbd80447027ccce2/src/werkzeug/urls.py#L926-L931) though there are some differences. - Closes <https://github.com/RDFLib/rdflib/issues/2120>.
-rw-r--r--rdflib/util.py90
-rw-r--r--test/test_graph/test_graph_http.py43
-rw-r--r--test/test_util.py18
-rw-r--r--test/utils/http.py9
-rw-r--r--test/utils/wildcard.py28
5 files changed, 169 insertions, 19 deletions
diff --git a/rdflib/util.py b/rdflib/util.py
index 4485de2e..2442b372 100644
--- a/rdflib/util.py
+++ b/rdflib/util.py
@@ -522,32 +522,92 @@ def _coalesce(
return default
+_RFC3986_SUBDELIMS = "!$&'()*+,;="
+"""
+``sub-delims`` production from `RFC 3986, section 2.2
+<https://www.rfc-editor.org/rfc/rfc3986.html#section-2.2>`_.
+"""
+
+_RFC3986_PCHAR_NU = "%" + _RFC3986_SUBDELIMS + ":@"
+"""
+The non-unreserved characters in the ``pchar`` production from RFC 3986.
+"""
+
+_QUERY_SAFE_CHARS = _RFC3986_PCHAR_NU + "/?"
+"""
+The non-unreserved characters that are safe to use in the query and fragment
+components.
+
+.. code-block::
+
+ pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
+ query = *( pchar / "/" / "?" )
+ fragment = *( pchar / "/" / "?" )
+"""
+
+_USERNAME_SAFE_CHARS = _RFC3986_SUBDELIMS + "%"
+"""
+The non-unreserved characters that are safe to use in the username and password
+components.
+
+.. code-block::
+
+ userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
+
+":" is excluded because these safe characters are applied to the username and
+the password individually; the two are quoted separately and then joined by ":".
+"""
+
+_PATH_SAFE_CHARS = _RFC3986_PCHAR_NU + "/"
+"""
+The non-unreserved characters that are safe to use in the path component.
+
+
+This is based on various path-related productions from RFC 3986.
+"""
+
+
def _iri2uri(iri: str) -> str:
"""
- Convert an IRI to a URI (Python 3).
- https://stackoverflow.com/a/42309027
- https://stackoverflow.com/a/40654295
- netloc should be encoded using IDNA;
- non-ascii URL path should be encoded to UTF-8 and then percent-escaped;
- non-ascii query parameters should be encoded to the encoding of a page
- URL was extracted from (or to the encoding server uses), then
- percent-escaped.
+ Prior art:
+
+ * `iri_to_uri from Werkzeug <https://github.com/pallets/werkzeug/blob/92c6380248c7272ee668e1f8bbd80447027ccce2/src/werkzeug/urls.py#L926-L931>`_
+
>>> _iri2uri("https://dbpedia.org/resource/Almería")
'https://dbpedia.org/resource/Almer%C3%ADa'
"""
+ # https://datatracker.ietf.org/doc/html/rfc3986
# https://datatracker.ietf.org/doc/html/rfc3305
- (scheme, netloc, path, query, fragment) = urlsplit(iri)
+ parts = urlsplit(iri)
+ (scheme, netloc, path, query, fragment) = parts
- # Just support http/https, otherwise return the iri unmolested
+ # Just support http/https, otherwise return the iri unaltered
if scheme not in ["http", "https"]:
return iri
- scheme = quote(scheme)
- netloc = netloc.encode("idna").decode("utf-8")
- path = quote(path)
- query = quote(query)
- fragment = quote(fragment)
+ path = quote(path, safe=_PATH_SAFE_CHARS)
+ query = quote(query, safe=_QUERY_SAFE_CHARS)
+ fragment = quote(fragment, safe=_QUERY_SAFE_CHARS)
+
+ if parts.hostname:
+ netloc = parts.hostname.encode("idna").decode("ascii")
+ else:
+ netloc = ""
+
+ if ":" in netloc:
+ # Quote IPv6 addresses
+ netloc = f"[{netloc}]"
+
+ if parts.port:
+ netloc = f"{netloc}:{parts.port}"
+
+ if parts.username:
+ auth = quote(parts.username, safe=_USERNAME_SAFE_CHARS)
+ if parts.password:
+ pass_quoted = quote(parts.password, safe=_USERNAME_SAFE_CHARS)
+ auth = f"{auth}:{pass_quoted}"
+ netloc = f"{auth}@{netloc}"
+
uri = urlunsplit((scheme, netloc, path, query, fragment))
if iri.endswith("#") and not uri.endswith("#"):
diff --git a/test/test_graph/test_graph_http.py b/test/test_graph/test_graph_http.py
index 762e3d5b..97c64c3a 100644
--- a/test/test_graph/test_graph_http.py
+++ b/test/test_graph/test_graph_http.py
@@ -1,14 +1,20 @@
+import logging
import re
from http.server import BaseHTTPRequestHandler
from test.data import TEST_DATA_DIR
from test.utils import GraphHelper
from test.utils.graph import cached_graph
-from test.utils.http import ctx_http_handler
+from test.utils.http import (
+ MOCK_HTTP_REQUEST_WILDCARD,
+ MockHTTPRequest,
+ ctx_http_handler,
+)
from test.utils.httpservermock import (
MethodName,
MockHTTPResponse,
ServedBaseHTTPServerMock,
)
+from test.utils.wildcard import URL_PARSE_RESULT_WILDCARD
from urllib.error import HTTPError
import pytest
@@ -235,7 +241,34 @@ class TestGraphHTTP:
assert raised.value.code == 500
-def test_iri_source(function_httpmock: ServedBaseHTTPServerMock) -> None:
+@pytest.mark.parametrize(
+ ["url_suffix", "expected_request"],
+ [
+ (
+ "/resource/Almería",
+ MOCK_HTTP_REQUEST_WILDCARD._replace(
+ path="/resource/Almer%C3%ADa",
+ parsed_path=URL_PARSE_RESULT_WILDCARD._replace(
+ path="/resource/Almer%C3%ADa"
+ ),
+ ),
+ ),
+ (
+ "/resource/Almería?foo=bar",
+ MOCK_HTTP_REQUEST_WILDCARD._replace(
+ parsed_path=URL_PARSE_RESULT_WILDCARD._replace(
+ path="/resource/Almer%C3%ADa"
+ ),
+ path_query={"foo": ["bar"]},
+ ),
+ ),
+ ],
+)
+def test_iri_source(
+ url_suffix: str,
+ expected_request: MockHTTPRequest,
+ function_httpmock: ServedBaseHTTPServerMock,
+) -> None:
diverse_triples_path = TEST_DATA_DIR / "variants/diverse_triples.ttl"
function_httpmock.responses[MethodName.GET].append(
@@ -247,9 +280,11 @@ def test_iri_source(function_httpmock: ServedBaseHTTPServerMock) -> None:
)
)
g = Graph()
- g.parse(f"{function_httpmock.url}/resource/Almería")
+ g.parse(f"{function_httpmock.url}{url_suffix}")
assert function_httpmock.call_count == 1
GraphHelper.assert_triple_sets_equals(cached_graph((diverse_triples_path,)), g)
+ assert len(g) > 1
req = function_httpmock.requests[MethodName.GET].pop(0)
- assert req.path == "/resource/Almer%C3%ADa"
+ logging.debug("req = %s", req)
+ assert expected_request == req
diff --git a/test/test_util.py b/test/test_util.py
index 3e60bbb8..c842bc92 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -635,6 +635,24 @@ def test_get_tree(
"http://example.com:1231/",
},
),
+ (
+ "http://example.com:1231/a=b",
+ {
+ "http://example.com:1231/a=b",
+ },
+ ),
+ (
+ "http://aé:aé@example.com:1231/bé/a=bé&c=d#a=bé&c=d",
+ {
+ "http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d",
+ },
+ ),
+ (
+ "http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d",
+ {
+ "http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d",
+ },
+ ),
],
)
def test_iri2uri(iri: str, expected_result: Union[Set[str], Type[Exception]]) -> None:
diff --git a/test/utils/http.py b/test/utils/http.py
index af72e015..fa13a2ed 100644
--- a/test/utils/http.py
+++ b/test/utils/http.py
@@ -4,6 +4,7 @@ import enum
import random
from contextlib import contextmanager
from http.server import BaseHTTPRequestHandler, HTTPServer
+from test.utils.wildcard import EQ_WILDCARD
from threading import Thread
from typing import (
Dict,
@@ -62,6 +63,14 @@ class MockHTTPRequest(NamedTuple):
body: Optional[bytes]
+MOCK_HTTP_REQUEST_WILDCARD = MockHTTPRequest(
+ EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD
+)
+"""
+This object should be equal to any `MockHTTPRequest` object.
+"""
+
+
class MockHTTPResponse(NamedTuple):
status_code: int
reason_phrase: str
diff --git a/test/utils/wildcard.py b/test/utils/wildcard.py
new file mode 100644
index 00000000..7444a24b
--- /dev/null
+++ b/test/utils/wildcard.py
@@ -0,0 +1,28 @@
+from typing import Any
+from urllib.parse import ParseResult
+
+
+class EqWildcard:
+ """
+ An object that matches anything.
+ """
+
+ def __eq__(self, other: Any) -> Any:
+ return True
+
+ def __req__(self, other: Any) -> Any:
+ return True
+
+ def __repr__(self) -> str:
+ return "EqWildcard()"
+
+
+EQ_WILDCARD: Any = EqWildcard()
+
+
+URL_PARSE_RESULT_WILDCARD = ParseResult(
+ EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD
+)
+"""
+This should be equal to any `ParseResult` object.
+"""