Merged two NTriples parser files

Changed name of NTriplesParser to W3CNTriplesParser; it is the legacy parser. Populate the CharacterStream attr on several types of rdflib InputSource to provide a unicode text stream in addition to the ByteStream. Add support to the N3, Trig, NTriples, and NQuads parsers to use the CharacterStream instead of the ByteStream where possible. This removes many useless string->bytes->string conversions in the parsers.
author: Ashley Sommer <ashleysommer@gmail.com> 2020-08-19 12:16:06 +1000
committer: Ashley Sommer <ashleysommer@gmail.com> 2020-08-19 12:16:06 +1000
commit: ceab6b2f71f97e45007b5306d2e0416bfefcea75 (patch)
tree: 96e350185347e048382a94da29c3ad04c0d7164f /rdflib/parser.py
parent: 89cb369113a28a24d899e0a79123fd51fe6a8119 (diff)
download: rdflib-ceab6b2f71f97e45007b5306d2e0416bfefcea75.tar.gz
1 files changed, 85 insertions, 19 deletions
diff --git a/rdflib/parser.py b/rdflib/parser.py
index 9e501c03..73ce2ba7 100644
--- a/rdflib/parser.py
+++ b/rdflib/parser.py
@@ -9,15 +9,11 @@ can plugin to rdflib. If you are wanting to invoke a parser you likely
 want to do so through the Graph class parse method.
 
 """
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+import codecs
 import os
 import sys
 
-from io import BytesIO
-
+from io import BytesIO, TextIOBase, TextIOWrapper, StringIO, BufferedIOBase
 
 from urllib.request import pathname2url
 from urllib.request import Request
@@ -41,6 +37,8 @@ __all__ = [
 
 
 class Parser(object):
+    __slots__ = set()
+
     def __init__(self):
         pass
 
@@ -48,6 +46,37 @@ class Parser(object):
         pass
 
 
+class BytesIOWrapper(BufferedIOBase):
+    __slots__ = ("wrapped", "encoded", "encoding")
+
+    def __init__(self, wrapped: str, encoding="utf-8"):
+        super(BytesIOWrapper, self).__init__()
+        self.wrapped = wrapped
+        self.encoding = encoding
+        self.encoded = None
+
+    def read(self, *args, **kwargs):
+        if self.encoded is None:
+            b, blen = codecs.getencoder(self.encoding)(self.wrapped)
+            self.encoded = BytesIO(b)
+        return self.encoded.read(*args, **kwargs)
+
+    def read1(self, *args, **kwargs):
+        if self.encoded is None:
+            b = codecs.getencoder(self.encoding)(self.wrapped)
+            self.encoded = BytesIO(b)
+        return self.encoded.read1(*args, **kwargs)
+
+    def readinto(self, *args, **kwargs):
+        raise NotImplementedError()
+
+    def readinto1(self, *args, **kwargs):
+        raise NotImplementedError()
+
+    def write(self, *args, **kwargs):
+        raise NotImplementedError()
+
+
 class InputSource(xmlreader.InputSource, object):
     """
     TODO:
@@ -59,23 +88,39 @@ class InputSource(xmlreader.InputSource, object):
         self.auto_close = False  # see Graph.parse(), true if opened by us
 
     def close(self):
+        c = self.getCharacterStream()
+        if c and hasattr(c, "close"):
+            try:
+                c.close()
+            except Exception:
+                pass
         f = self.getByteStream()
         if f and hasattr(f, "close"):
-            f.close()
+            try:
+                f.close()
+            except Exception:
+                pass
 
 
 class StringInputSource(InputSource):
     """
-    TODO:
+    Constructs an RDFLib Parser InputSource from a Python String or Bytes
     """
 
-    def __init__(self, value, system_id=None):
+    def __init__(self, value, encoding="utf-8", system_id=None):
         super(StringInputSource, self).__init__(system_id)
-        stream = BytesIO(value)
-        self.setByteStream(stream)
-        # TODO:
-        #   encoding = value.encoding
-        #   self.setEncoding(encoding)
+        if isinstance(value, str):
+            stream = StringIO(value)
+            self.setCharacterStream(stream)
+            self.setEncoding(encoding)
+            b_stream = BytesIOWrapper(value, encoding)
+            self.setByteStream(b_stream)
+        else:
+            stream = BytesIO(value)
+            self.setByteStream(stream)
+            c_stream = TextIOWrapper(stream, encoding)
+            self.setCharacterStream(c_stream)
+            self.setEncoding(c_stream.encoding)
 
 
 headers = {
@@ -134,8 +179,18 @@ class FileInputSource(InputSource):
         system_id = URIRef(urljoin("file:", pathname2url(file.name)), base=base)
         super(FileInputSource, self).__init__(system_id)
         self.file = file
-        self.setByteStream(file)
-        # TODO: self.setEncoding(encoding)
+        if isinstance(file, TextIOBase):  # Python3 unicode fp
+            self.setCharacterStream(file)
+            self.setEncoding(file.encoding)
+            try:
+                b = file.buffer
+                self.setByteStream(b)
+            except (AttributeError, LookupError):
+                self.setByteStream(file)
+        else:
+            self.setByteStream(file)
+            # We cannot set characterStream here because
+            # we do not know the Raw Bytes File encoding.
 
     def __repr__(self):
         return repr(self.file)
@@ -171,10 +226,21 @@ def create_input_source(
         else:
             if isinstance(source, str):
                 location = source
+            elif isinstance(source, bytes):
+                data = source
             elif hasattr(source, "read") and not isinstance(source, Namespace):
                 f = source
                 input_source = InputSource()
-                input_source.setByteStream(f)
+                if hasattr(source, "encoding"):
+                    input_source.setCharacterStream(source)
+                    input_source.setEncoding(source.encoding)
+                    try:
+                        b = file.buffer
+                        input_source.setByteStream(b)
+                    except (AttributeError, LookupError):
+                        input_source.setByteStream(source)
+                else:
+                    input_source.setByteStream(f)
                 if f is sys.stdin:
                     input_source.setSystemId("file:///dev/stdin")
                 elif hasattr(f, "name"):
@@ -206,8 +272,8 @@ def create_input_source(
         input_source = FileInputSource(file)
 
     if data is not None:
-        if isinstance(data, str):
-            data = data.encode("utf-8")
+        if not isinstance(data, (str, bytes, bytearray)):
+            raise RuntimeError("parse data can only str, or bytes.")
         input_source = StringInputSource(data)
         auto_close = True
author	Ashley Sommer <ashleysommer@gmail.com>	2020-08-19 12:16:06 +1000
committer	Ashley Sommer <ashleysommer@gmail.com>	2020-08-19 12:16:06 +1000
commit	ceab6b2f71f97e45007b5306d2e0416bfefcea75 (patch)
tree	96e350185347e048382a94da29c3ad04c0d7164f /rdflib/parser.py
parent	89cb369113a28a24d899e0a79123fd51fe6a8119 (diff)
download	rdflib-ceab6b2f71f97e45007b5306d2e0416bfefcea75.tar.gz