summaryrefslogtreecommitdiff
path: root/rdflib/parser.py
diff options
context:
space:
mode:
authorAshley Sommer <ashleysommer@gmail.com>2020-08-19 12:16:06 +1000
committerAshley Sommer <ashleysommer@gmail.com>2020-08-19 12:16:06 +1000
commitceab6b2f71f97e45007b5306d2e0416bfefcea75 (patch)
tree96e350185347e048382a94da29c3ad04c0d7164f /rdflib/parser.py
parent89cb369113a28a24d899e0a79123fd51fe6a8119 (diff)
downloadrdflib-ceab6b2f71f97e45007b5306d2e0416bfefcea75.tar.gz
Merged two Ntriples parser files
Changed the name of NTriplesParser to W3CNTriplesParser; it is the legacy parser. Populate the CharacterStream attr on several types of rdflib InputSource, to provide a unicode text stream in addition to the ByteStream. Add support to the N3, Trig, NTriples, and NQuads parsers to use the CharacterStream instead of the ByteStream where possible. This reduces many useless string->bytes->string conversions in parsers.
Diffstat (limited to 'rdflib/parser.py')
-rw-r--r--rdflib/parser.py104
1 files changed, 85 insertions, 19 deletions
diff --git a/rdflib/parser.py b/rdflib/parser.py
index 9e501c03..73ce2ba7 100644
--- a/rdflib/parser.py
+++ b/rdflib/parser.py
@@ -9,15 +9,11 @@ can plugin to rdflib. If you are wanting to invoke a parser you likely
want to do so through the Graph class parse method.
"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+import codecs
import os
import sys
-from io import BytesIO
-
+from io import BytesIO, TextIOBase, TextIOWrapper, StringIO, BufferedIOBase
from urllib.request import pathname2url
from urllib.request import Request
@@ -41,6 +37,8 @@ __all__ = [
class Parser(object):
+ __slots__ = set()
+
def __init__(self):
pass
@@ -48,6 +46,37 @@ class Parser(object):
pass
+class BytesIOWrapper(BufferedIOBase):
+    """Read-only binary view over a ``str``, encoded lazily on first read."""
+
+    __slots__ = ("wrapped", "encoded", "encoding")
+
+    def __init__(self, wrapped: str, encoding="utf-8"):
+        super(BytesIOWrapper, self).__init__()
+        self.wrapped = wrapped
+        self.encoding = encoding
+        # Created on the first read, so nothing is encoded unless bytes are
+        # actually requested.
+        self.encoded = None
+
+    def _encoded_stream(self):
+        """Return the BytesIO holding the encoded text, creating it if needed."""
+        if self.encoded is None:
+            # The encoder returns (bytes, length_consumed); only the bytes
+            # may be handed to BytesIO.
+            b, _ = codecs.getencoder(self.encoding)(self.wrapped)
+            self.encoded = BytesIO(b)
+        return self.encoded
+
+    def read(self, *args, **kwargs):
+        """Read from the encoded text; arguments are passed to BytesIO.read."""
+        return self._encoded_stream().read(*args, **kwargs)
+
+    def read1(self, *args, **kwargs):
+        """Read from the encoded text; arguments are passed to BytesIO.read1."""
+        return self._encoded_stream().read1(*args, **kwargs)
+
+    def readinto(self, *args, **kwargs):
+        raise NotImplementedError()
+
+    def readinto1(self, *args, **kwargs):
+        raise NotImplementedError()
+
+    def write(self, *args, **kwargs):
+        raise NotImplementedError()
+
+
class InputSource(xmlreader.InputSource, object):
"""
TODO:
@@ -59,23 +88,39 @@ class InputSource(xmlreader.InputSource, object):
self.auto_close = False # see Graph.parse(), true if opened by us
def close(self):
+ c = self.getCharacterStream()
+ if c and hasattr(c, "close"):
+ try:
+ c.close()
+ except Exception:
+ pass
f = self.getByteStream()
if f and hasattr(f, "close"):
- f.close()
+ try:
+ f.close()
+ except Exception:
+ pass
class StringInputSource(InputSource):
"""
- TODO:
+ Constructs an RDFLib Parser InputSource from a Python String or Bytes
"""
- def __init__(self, value, system_id=None):
+ def __init__(self, value, encoding="utf-8", system_id=None):
super(StringInputSource, self).__init__(system_id)
- stream = BytesIO(value)
- self.setByteStream(stream)
- # TODO:
- # encoding = value.encoding
- # self.setEncoding(encoding)
+ if isinstance(value, str):
+ stream = StringIO(value)
+ self.setCharacterStream(stream)
+ self.setEncoding(encoding)
+ b_stream = BytesIOWrapper(value, encoding)
+ self.setByteStream(b_stream)
+ else:
+ stream = BytesIO(value)
+ self.setByteStream(stream)
+ c_stream = TextIOWrapper(stream, encoding)
+ self.setCharacterStream(c_stream)
+ self.setEncoding(c_stream.encoding)
headers = {
@@ -134,8 +179,18 @@ class FileInputSource(InputSource):
system_id = URIRef(urljoin("file:", pathname2url(file.name)), base=base)
super(FileInputSource, self).__init__(system_id)
self.file = file
- self.setByteStream(file)
- # TODO: self.setEncoding(encoding)
+ if isinstance(file, TextIOBase): # Python3 unicode fp
+ self.setCharacterStream(file)
+ self.setEncoding(file.encoding)
+ try:
+ b = file.buffer
+ self.setByteStream(b)
+ except (AttributeError, LookupError):
+ self.setByteStream(file)
+ else:
+ self.setByteStream(file)
+ # We cannot set characterStream here because
+ # we do not know the Raw Bytes File encoding.
def __repr__(self):
return repr(self.file)
@@ -171,10 +226,21 @@ def create_input_source(
else:
if isinstance(source, str):
location = source
+ elif isinstance(source, bytes):
+ data = source
elif hasattr(source, "read") and not isinstance(source, Namespace):
f = source
input_source = InputSource()
- input_source.setByteStream(f)
+ if hasattr(source, "encoding"):
+ input_source.setCharacterStream(source)
+ input_source.setEncoding(source.encoding)
+ try:
+ b = file.buffer
+ input_source.setByteStream(b)
+ except (AttributeError, LookupError):
+ input_source.setByteStream(source)
+ else:
+ input_source.setByteStream(f)
if f is sys.stdin:
input_source.setSystemId("file:///dev/stdin")
elif hasattr(f, "name"):
@@ -206,8 +272,8 @@ def create_input_source(
input_source = FileInputSource(file)
if data is not None:
- if isinstance(data, str):
- data = data.encode("utf-8")
+ if not isinstance(data, (str, bytes, bytearray)):
+ raise RuntimeError("parse data can only str, or bytes.")
input_source = StringInputSource(data)
auto_close = True