summaryrefslogtreecommitdiff
path: root/rdflib/util.py
diff options
context:
space:
mode:
authorGraham Higgins <gjhiggins@users.noreply.github.com>2022-05-19 18:28:33 +0100
committerGitHub <noreply@github.com>2022-05-19 19:28:33 +0200
commit32923ce4ea53731ece3800890a0b65c863470013 (patch)
treec5ca9aebba8775064146f7448e5b9f6b32403e7f /rdflib/util.py
parent10f33ee43433093069b18fb71c25021d62bfafe5 (diff)
downloadrdflib-32923ce4ea53731ece3800890a0b65c863470013.tar.gz
Fixes #1429, add `iri2uri` (#1902)
Add an iri-to-uri conversion utility to encode IRIs to URIs for `Graph.parse()` sources. Added a couple of tests because feeding it with a suite of IRIs to check seems overkill (not that I could find one). Fixes #1429 Co-authored-by: Iwan Aucamp <aucampia@gmail.com>
Diffstat (limited to 'rdflib/util.py')
-rw-r--r--rdflib/util.py35
1 files changed, 35 insertions, 0 deletions
diff --git a/rdflib/util.py b/rdflib/util.py
index 246b5cc8..b73a9594 100644
--- a/rdflib/util.py
+++ b/rdflib/util.py
@@ -36,6 +36,7 @@ from typing import (
Tuple,
TypeVar,
)
+from urllib.parse import quote, urlsplit, urlunsplit
import rdflib.graph # avoid circular dependency
from rdflib.compat import sign
@@ -58,6 +59,7 @@ __all__ = [
"find_roots",
"get_tree",
"_coalesce",
+ "_iri2uri",
]
@@ -476,3 +478,36 @@ def _coalesce(*args: Optional[_AnyT]) -> Optional[_AnyT]:
if arg is not None:
return arg
return None
+
+
+def _iri2uri(iri: str) -> str:
+ """
+ Convert an IRI to a URI (Python 3).
+ https://stackoverflow.com/a/42309027
+ https://stackoverflow.com/a/40654295
+ netloc should be encoded using IDNA;
+ non-ascii URL path should be encoded to UTF-8 and then percent-escaped;
+ non-ascii query parameters should be encoded to the encoding of a page
+ URL was extracted from (or to the encoding server uses), then
+ percent-escaped.
+ >>> _iri2uri("https://dbpedia.org/resource/Almería")
+ 'https://dbpedia.org/resource/Almer%C3%ADa'
+ """
+
+ (scheme, netloc, path, query, fragment) = urlsplit(iri)
+
+ # Just support http/https, otherwise return the iri unmolested
+ if scheme not in ["http", "https"]:
+ return iri
+
+ scheme = quote(scheme)
+ netloc = quote(netloc.encode("idna").decode("utf-8"))
+ path = quote(path)
+ query = quote(query)
+ fragment = quote(fragment)
+ uri = urlunsplit((scheme, netloc, path, query, fragment))
+
+ if iri.endswith("#") and not uri.endswith("#"):
+ uri += "#"
+
+ return uri