summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- rdflib/util.py 90
-rw-r--r-- test/test_graph/test_graph_http.py 43
-rw-r--r-- test/test_util.py 18
-rw-r--r-- test/utils/http.py 9
-rw-r--r-- test/utils/wildcard.py 28
5 files changed, 169 insertions, 19 deletions
diff --git a/rdflib/util.py b/rdflib/util.py
index 4485de2e..2442b372 100644
--- a/rdflib/util.py
+++ b/rdflib/util.py
@@ -522,32 +522,92 @@ def _coalesce(
return default
+_RFC3986_SUBDELIMS = "!$&'()*+,;="
+"""
+``sub-delims`` production from `RFC 3986, section 2.2
+<https://www.rfc-editor.org/rfc/rfc3986.html#section-2.2>`_.
+"""
+
+_RFC3986_PCHAR_NU = "%" + _RFC3986_SUBDELIMS + ":@"
+"""
+The non-unreserved characters in the ``pchar`` production from RFC 3986.
+"""
+
+_QUERY_SAFE_CHARS = _RFC3986_PCHAR_NU + "/?"
+"""
+The non-unreserved characters that are safe to use in the query and fragment
+components.
+
+.. code-block::
+
+    pchar    = unreserved / pct-encoded / sub-delims / ":" / "@"
+    query    = *( pchar / "/" / "?" )
+    fragment = *( pchar / "/" / "?" )
+"""
+
+_USERNAME_SAFE_CHARS = _RFC3986_SUBDELIMS + "%"
+"""
+The non-unreserved characters that are safe to use in the username and password
+components.
+
+.. code-block::
+
+ userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
+
+":" is excluded because this set is applied to the username and the password
+individually, and ":" is the delimiter that separates them.
+"""
+
+_PATH_SAFE_CHARS = _RFC3986_PCHAR_NU + "/"
+"""
+The non-unreserved characters that are safe to use in the path component.
+
+
+This is based on various path-related productions from RFC 3986.
+"""
+
+
def _iri2uri(iri: str) -> str:
"""
- Convert an IRI to a URI (Python 3).
- https://stackoverflow.com/a/42309027
- https://stackoverflow.com/a/40654295
- netloc should be encoded using IDNA;
- non-ascii URL path should be encoded to UTF-8 and then percent-escaped;
- non-ascii query parameters should be encoded to the encoding of a page
- URL was extracted from (or to the encoding server uses), then
- percent-escaped.
+ Prior art:
+
+ * `iri_to_uri from Werkzeug <https://github.com/pallets/werkzeug/blob/92c6380248c7272ee668e1f8bbd80447027ccce2/src/werkzeug/urls.py#L926-L931>`_
+
>>> _iri2uri("https://dbpedia.org/resource/Almería")
'https://dbpedia.org/resource/Almer%C3%ADa'
"""
+ # https://datatracker.ietf.org/doc/html/rfc3986
# https://datatracker.ietf.org/doc/html/rfc3305
- (scheme, netloc, path, query, fragment) = urlsplit(iri)
+ parts = urlsplit(iri)
+ (scheme, netloc, path, query, fragment) = parts
- # Just support http/https, otherwise return the iri unmolested
+ # Just support http/https, otherwise return the iri unaltered
if scheme not in ["http", "https"]:
return iri
- scheme = quote(scheme)
- netloc = netloc.encode("idna").decode("utf-8")
- path = quote(path)
- query = quote(query)
- fragment = quote(fragment)
+ path = quote(path, safe=_PATH_SAFE_CHARS)
+ query = quote(query, safe=_QUERY_SAFE_CHARS)
+ fragment = quote(fragment, safe=_QUERY_SAFE_CHARS)
+
+ if parts.hostname:
+ netloc = parts.hostname.encode("idna").decode("ascii")
+ else:
+ netloc = ""
+
+ if ":" in netloc:
+ # Quote IPv6 addresses
+ netloc = f"[{netloc}]"
+
+ if parts.port:
+ netloc = f"{netloc}:{parts.port}"
+
+ if parts.username:
+ auth = quote(parts.username, safe=_USERNAME_SAFE_CHARS)
+ if parts.password:
+ pass_quoted = quote(parts.password, safe=_USERNAME_SAFE_CHARS)
+ auth = f"{auth}:{pass_quoted}"
+ netloc = f"{auth}@{netloc}"
+
uri = urlunsplit((scheme, netloc, path, query, fragment))
if iri.endswith("#") and not uri.endswith("#"):
diff --git a/test/test_graph/test_graph_http.py b/test/test_graph/test_graph_http.py
index 762e3d5b..97c64c3a 100644
--- a/test/test_graph/test_graph_http.py
+++ b/test/test_graph/test_graph_http.py
@@ -1,14 +1,20 @@
+import logging
import re
from http.server import BaseHTTPRequestHandler
from test.data import TEST_DATA_DIR
from test.utils import GraphHelper
from test.utils.graph import cached_graph
-from test.utils.http import ctx_http_handler
+from test.utils.http import (
+ MOCK_HTTP_REQUEST_WILDCARD,
+ MockHTTPRequest,
+ ctx_http_handler,
+)
from test.utils.httpservermock import (
MethodName,
MockHTTPResponse,
ServedBaseHTTPServerMock,
)
+from test.utils.wildcard import URL_PARSE_RESULT_WILDCARD
from urllib.error import HTTPError
import pytest
@@ -235,7 +241,34 @@ class TestGraphHTTP:
assert raised.value.code == 500
-def test_iri_source(function_httpmock: ServedBaseHTTPServerMock) -> None:
+@pytest.mark.parametrize(
+ ["url_suffix", "expected_request"],
+ [
+ (
+ "/resource/Almería",
+ MOCK_HTTP_REQUEST_WILDCARD._replace(
+ path="/resource/Almer%C3%ADa",
+ parsed_path=URL_PARSE_RESULT_WILDCARD._replace(
+ path="/resource/Almer%C3%ADa"
+ ),
+ ),
+ ),
+ (
+ "/resource/Almería?foo=bar",
+ MOCK_HTTP_REQUEST_WILDCARD._replace(
+ parsed_path=URL_PARSE_RESULT_WILDCARD._replace(
+ path="/resource/Almer%C3%ADa"
+ ),
+ path_query={"foo": ["bar"]},
+ ),
+ ),
+ ],
+)
+def test_iri_source(
+ url_suffix: str,
+ expected_request: MockHTTPRequest,
+ function_httpmock: ServedBaseHTTPServerMock,
+) -> None:
diverse_triples_path = TEST_DATA_DIR / "variants/diverse_triples.ttl"
function_httpmock.responses[MethodName.GET].append(
@@ -247,9 +280,11 @@ def test_iri_source(function_httpmock: ServedBaseHTTPServerMock) -> None:
)
)
g = Graph()
- g.parse(f"{function_httpmock.url}/resource/Almería")
+ g.parse(f"{function_httpmock.url}{url_suffix}")
assert function_httpmock.call_count == 1
GraphHelper.assert_triple_sets_equals(cached_graph((diverse_triples_path,)), g)
+ assert len(g) > 1
req = function_httpmock.requests[MethodName.GET].pop(0)
- assert req.path == "/resource/Almer%C3%ADa"
+ logging.debug("req = %s", req)
+ assert expected_request == req
diff --git a/test/test_util.py b/test/test_util.py
index 3e60bbb8..c842bc92 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -635,6 +635,24 @@ def test_get_tree(
"http://example.com:1231/",
},
),
+ (
+ "http://example.com:1231/a=b",
+ {
+ "http://example.com:1231/a=b",
+ },
+ ),
+ (
+ "http://aé:aé@example.com:1231/bé/a=bé&c=d#a=bé&c=d",
+ {
+ "http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d",
+ },
+ ),
+ (
+ "http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d",
+ {
+ "http://a%C3%A9:a%C3%A9@example.com:1231/b%C3%A9/a=b%C3%A9&c=d#a=b%C3%A9&c=d",
+ },
+ ),
],
)
def test_iri2uri(iri: str, expected_result: Union[Set[str], Type[Exception]]) -> None:
diff --git a/test/utils/http.py b/test/utils/http.py
index af72e015..fa13a2ed 100644
--- a/test/utils/http.py
+++ b/test/utils/http.py
@@ -4,6 +4,7 @@ import enum
import random
from contextlib import contextmanager
from http.server import BaseHTTPRequestHandler, HTTPServer
+from test.utils.wildcard import EQ_WILDCARD
from threading import Thread
from typing import (
Dict,
@@ -62,6 +63,14 @@ class MockHTTPRequest(NamedTuple):
body: Optional[bytes]
+MOCK_HTTP_REQUEST_WILDCARD = MockHTTPRequest(
+ EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD
+)
+"""
+This object should be equal to any `MockHTTPRequest` object.
+"""
+
+
class MockHTTPResponse(NamedTuple):
status_code: int
reason_phrase: str
diff --git a/test/utils/wildcard.py b/test/utils/wildcard.py
new file mode 100644
index 00000000..7444a24b
--- /dev/null
+++ b/test/utils/wildcard.py
@@ -0,0 +1,28 @@
+from typing import Any
+from urllib.parse import ParseResult
+
+
+class EqWildcard:
+ """
+ An object that compares equal to any other object.
+ """
+
+ def __eq__(self, other: Any) -> Any:
+ return True
+
+ def __req__(self, other: Any) -> Any:
+ return True
+
+ def __repr__(self) -> str:
+ return "EqWildcard()"
+
+
+EQ_WILDCARD: Any = EqWildcard()
+
+
+URL_PARSE_RESULT_WILDCARD = ParseResult(
+ EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD, EQ_WILDCARD
+)
+"""
+This should be equal to any `ParseResult` object.
+"""