diff options
author | Graham Higgins <gjhiggins@users.noreply.github.com> | 2022-05-19 18:28:33 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-05-19 19:28:33 +0200 |
commit | 32923ce4ea53731ece3800890a0b65c863470013 (patch) | |
tree | c5ca9aebba8775064146f7448e5b9f6b32403e7f /rdflib/util.py | |
parent | 10f33ee43433093069b18fb71c25021d62bfafe5 (diff) | |
download | rdflib-32923ce4ea53731ece3800890a0b65c863470013.tar.gz |
Fixes #1429, add `iri2uri` (#1902)
Add an iri-to-uri conversion utility to encode IRIs to URIs for `Graph.parse()` sources. Added a couple of tests because feeding it with a suite of IRIs to check seems overkill (not that I could find one).
Fixes #1429
Co-authored-by: Iwan Aucamp <aucampia@gmail.com>
Diffstat (limited to 'rdflib/util.py')
-rw-r--r-- | rdflib/util.py | 35 |
1 files changed, 35 insertions, 0 deletions
diff --git a/rdflib/util.py b/rdflib/util.py index 246b5cc8..b73a9594 100644 --- a/rdflib/util.py +++ b/rdflib/util.py @@ -36,6 +36,7 @@ from typing import ( Tuple, TypeVar, ) +from urllib.parse import quote, urlsplit, urlunsplit import rdflib.graph # avoid circular dependency from rdflib.compat import sign @@ -58,6 +59,7 @@ __all__ = [ "find_roots", "get_tree", "_coalesce", + "_iri2uri", ] @@ -476,3 +478,36 @@ def _coalesce(*args: Optional[_AnyT]) -> Optional[_AnyT]: if arg is not None: return arg return None + + +def _iri2uri(iri: str) -> str: + """ + Convert an IRI to a URI (Python 3). + https://stackoverflow.com/a/42309027 + https://stackoverflow.com/a/40654295 + netloc should be encoded using IDNA; + non-ascii URL path should be encoded to UTF-8 and then percent-escaped; + non-ascii query parameters should be encoded to the encoding of a page + URL was extracted from (or to the encoding server uses), then + percent-escaped. + >>> _iri2uri("https://dbpedia.org/resource/Almería") + 'https://dbpedia.org/resource/Almer%C3%ADa' + """ + + (scheme, netloc, path, query, fragment) = urlsplit(iri) + + # Just support http/https, otherwise return the iri unmolested + if scheme not in ["http", "https"]: + return iri + + scheme = quote(scheme) + netloc = quote(netloc.encode("idna").decode("utf-8")) + path = quote(path) + query = quote(query) + fragment = quote(fragment) + uri = urlunsplit((scheme, netloc, path, query, fragment)) + + if iri.endswith("#") and not uri.endswith("#"): + uri += "#" + + return uri |