diff options
author | Ashley Sommer <ashleysommer@gmail.com> | 2020-08-19 12:16:06 +1000 |
---|---|---|
committer | Ashley Sommer <ashleysommer@gmail.com> | 2020-08-19 12:16:06 +1000 |
commit | ceab6b2f71f97e45007b5306d2e0416bfefcea75 (patch) | |
tree | 96e350185347e048382a94da29c3ad04c0d7164f /rdflib/parser.py | |
parent | 89cb369113a28a24d899e0a79123fd51fe6a8119 (diff) | |
download | rdflib-ceab6b2f71f97e45007b5306d2e0416bfefcea75.tar.gz |
Merged two Ntriples parser files
Changed name of NTriplesParser to W3CNTriplesParser; it is the legacy parser
Populate the CharacterStream attr on several types of rdflib InputSource, to provide a unicode text stream in addition to the ByteStream
Add support to the N3, Trig, NTriples, and NQuads parsers for using the CharacterStream instead of the ByteStream where possible
Reduces many useless string->bytes->string conversions in parsers.
Diffstat (limited to 'rdflib/parser.py')
-rw-r--r-- | rdflib/parser.py | 104 |
1 files changed, 85 insertions, 19 deletions
diff --git a/rdflib/parser.py b/rdflib/parser.py index 9e501c03..73ce2ba7 100644 --- a/rdflib/parser.py +++ b/rdflib/parser.py @@ -9,15 +9,11 @@ can plugin to rdflib. If you are wanting to invoke a parser you likely want to do so through the Graph class parse method. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - +import codecs import os import sys -from io import BytesIO - +from io import BytesIO, TextIOBase, TextIOWrapper, StringIO, BufferedIOBase from urllib.request import pathname2url from urllib.request import Request @@ -41,6 +37,8 @@ __all__ = [ class Parser(object): + __slots__ = set() + def __init__(self): pass @@ -48,6 +46,37 @@ class Parser(object): pass +class BytesIOWrapper(BufferedIOBase): + __slots__ = ("wrapped", "encoded", "encoding") + + def __init__(self, wrapped: str, encoding="utf-8"): + super(BytesIOWrapper, self).__init__() + self.wrapped = wrapped + self.encoding = encoding + self.encoded = None + + def read(self, *args, **kwargs): + if self.encoded is None: + b, blen = codecs.getencoder(self.encoding)(self.wrapped) + self.encoded = BytesIO(b) + return self.encoded.read(*args, **kwargs) + + def read1(self, *args, **kwargs): + if self.encoded is None: + b = codecs.getencoder(self.encoding)(self.wrapped) + self.encoded = BytesIO(b) + return self.encoded.read1(*args, **kwargs) + + def readinto(self, *args, **kwargs): + raise NotImplementedError() + + def readinto1(self, *args, **kwargs): + raise NotImplementedError() + + def write(self, *args, **kwargs): + raise NotImplementedError() + + class InputSource(xmlreader.InputSource, object): """ TODO: @@ -59,23 +88,39 @@ class InputSource(xmlreader.InputSource, object): self.auto_close = False # see Graph.parse(), true if opened by us def close(self): + c = self.getCharacterStream() + if c and hasattr(c, "close"): + try: + c.close() + except Exception: + pass f = self.getByteStream() if f and hasattr(f, "close"): - f.close() + 
try: + f.close() + except Exception: + pass class StringInputSource(InputSource): """ - TODO: + Constructs an RDFLib Parser InputSource from a Python String or Bytes """ - def __init__(self, value, system_id=None): + def __init__(self, value, encoding="utf-8", system_id=None): super(StringInputSource, self).__init__(system_id) - stream = BytesIO(value) - self.setByteStream(stream) - # TODO: - # encoding = value.encoding - # self.setEncoding(encoding) + if isinstance(value, str): + stream = StringIO(value) + self.setCharacterStream(stream) + self.setEncoding(encoding) + b_stream = BytesIOWrapper(value, encoding) + self.setByteStream(b_stream) + else: + stream = BytesIO(value) + self.setByteStream(stream) + c_stream = TextIOWrapper(stream, encoding) + self.setCharacterStream(c_stream) + self.setEncoding(c_stream.encoding) headers = { @@ -134,8 +179,18 @@ class FileInputSource(InputSource): system_id = URIRef(urljoin("file:", pathname2url(file.name)), base=base) super(FileInputSource, self).__init__(system_id) self.file = file - self.setByteStream(file) - # TODO: self.setEncoding(encoding) + if isinstance(file, TextIOBase): # Python3 unicode fp + self.setCharacterStream(file) + self.setEncoding(file.encoding) + try: + b = file.buffer + self.setByteStream(b) + except (AttributeError, LookupError): + self.setByteStream(file) + else: + self.setByteStream(file) + # We cannot set characterStream here because + # we do not know the Raw Bytes File encoding. 
def __repr__(self): return repr(self.file) @@ -171,10 +226,21 @@ def create_input_source( else: if isinstance(source, str): location = source + elif isinstance(source, bytes): + data = source elif hasattr(source, "read") and not isinstance(source, Namespace): f = source input_source = InputSource() - input_source.setByteStream(f) + if hasattr(source, "encoding"): + input_source.setCharacterStream(source) + input_source.setEncoding(source.encoding) + try: + b = file.buffer + input_source.setByteStream(b) + except (AttributeError, LookupError): + input_source.setByteStream(source) + else: + input_source.setByteStream(f) if f is sys.stdin: input_source.setSystemId("file:///dev/stdin") elif hasattr(f, "name"): @@ -206,8 +272,8 @@ def create_input_source( input_source = FileInputSource(file) if data is not None: - if isinstance(data, str): - data = data.encode("utf-8") + if not isinstance(data, (str, bytes, bytearray)): + raise RuntimeError("parse data can only str, or bytes.") input_source = StringInputSource(data) auto_close = True |