Diffstat (limited to 'sqlparse')
-rw-r--r--  sqlparse/compat.py     18
-rw-r--r--  sqlparse/keywords.py   67
-rw-r--r--  sqlparse/lexer.py     372
3 files changed, 122 insertions, 335 deletions
diff --git a/sqlparse/compat.py b/sqlparse/compat.py
index 334883b..c1aacf6 100644
--- a/sqlparse/compat.py
+++ b/sqlparse/compat.py
@@ -14,11 +14,10 @@ PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3
if PY3:
- def u(s):
+ def u(s, encoding=None):
return str(s)
- range = range
text_type = str
string_types = (str,)
from io import StringIO
@@ -33,21 +32,6 @@ elif PY2:
return unicode(s, encoding)
- range = xrange
text_type = unicode
string_types = (basestring,)
from StringIO import StringIO
-
-
-# Directly copied from six:
-def with_metaclass(meta, *bases):
- """Create a base class with a metaclass."""
-
- # This requires a bit of explanation: the basic idea is to make a dummy
- # metaclass for one level of class instantiation that replaces itself with
- # the actual metaclass.
- class metaclass(meta):
- def __new__(cls, name, this_bases, d):
- return meta(name, bases, d)
-
- return type.__new__(metaclass, 'temporary_class', (), {})
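
The compat change above is small but behavioral: on Python 3, u() now accepts
(and ignores) an encoding argument so call sites can pass one unconditionally,
and the six-style with_metaclass() helper is dropped because the rewritten
Lexer below no longer uses a metaclass. A minimal sketch of the Python 3
behavior; illustrative, not part of the commit:

    from sqlparse.compat import u, text_type

    # On Python 3 the encoding argument is accepted but ignored;
    # the value is passed straight through str().
    assert u('select 1') == 'select 1'
    assert u('select 1', encoding='utf-8') == 'select 1'
    assert isinstance(u('select 1'), text_type)
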
diff --git a/sqlparse/keywords.py b/sqlparse/keywords.py
index bfea5d1..9cda48a 100644
--- a/sqlparse/keywords.py
+++ b/sqlparse/keywords.py
@@ -1,5 +1,72 @@
+# -*- coding: utf-8 -*-
+
from sqlparse import tokens
+
+def is_keyword(value):
+ val = value.upper()
+ return (KEYWORDS_COMMON.get(val) or KEYWORDS.get(val, tokens.Name)), value
+
+
+SQL_REGEX = {
+ 'root': [
+ (r'(--|# ).*?(\r\n|\r|\n)', tokens.Comment.Single),
+ # $ matches *before* newline, therefore we have two patterns
+ # to match Comment.Single
+ (r'(--|# ).*?$', tokens.Comment.Single),
+ (r'(\r\n|\r|\n)', tokens.Newline),
+ (r'\s+', tokens.Whitespace),
+ (r'/\*', tokens.Comment.Multiline, 'multiline-comments'),
+ (r':=', tokens.Assignment),
+ (r'::', tokens.Punctuation),
+ (r'[*]', tokens.Wildcard),
+ (r'CASE\b', tokens.Keyword), # extended CASE(foo)
+ (r"`(``|[^`])*`", tokens.Name),
+ (r"´(´´|[^´])*´", tokens.Name),
+ (r'\$([^\W\d]\w*)?\$', tokens.Name.Builtin),
+ (r'\?{1}', tokens.Name.Placeholder),
+ (r'%\(\w+\)s', tokens.Name.Placeholder),
+ (r'%s', tokens.Name.Placeholder),
+ (r'[$:?]\w+', tokens.Name.Placeholder),
+ # FIXME(andi): VALUES shouldn't be listed here
+ # see https://github.com/andialbrecht/sqlparse/pull/64
+ (r'VALUES', tokens.Keyword),
+ (r'(@|##|#)[^\W\d_]\w+', tokens.Name),
+ # IN is special, it may be followed by a parenthesis, but
+ # is never a function, see issue183
+ (r'in\b(?=[ (])?', tokens.Keyword),
+ (r'USING(?=\()', tokens.Keyword),
+ (r'[^\W\d_]\w*(?=[.(])', tokens.Name), # see issue39
+ (r'[-]?0x[0-9a-fA-F]+', tokens.Number.Hexadecimal),
+ (r'[-]?[0-9]*(\.[0-9]+)?[eE][-]?[0-9]+', tokens.Number.Float),
+ (r'[-]?[0-9]*\.[0-9]+', tokens.Number.Float),
+ (r'[-]?[0-9]+', tokens.Number.Integer),
+ (r"'(''|\\\\|\\'|[^'])*'", tokens.String.Single),
+ # not a real string literal in ANSI SQL:
+ (r'(""|".*?[^\\]")', tokens.String.Symbol),
+ # sqlite names can be escaped with [square brackets]. left bracket
+ # cannot be preceded by word character or a right bracket --
+ # otherwise it's probably an array index
+ (r'(?<![\w\])])(\[[^\]]+\])', tokens.Name),
+ (r'((LEFT\s+|RIGHT\s+|FULL\s+)?(INNER\s+|OUTER\s+|STRAIGHT\s+)?'
+ r'|(CROSS\s+|NATURAL\s+)?)?JOIN\b', tokens.Keyword),
+ (r'END(\s+IF|\s+LOOP|\s+WHILE)?\b', tokens.Keyword),
+ (r'NOT NULL\b', tokens.Keyword),
+ (r'CREATE(\s+OR\s+REPLACE)?\b', tokens.Keyword.DDL),
+ (r'DOUBLE\s+PRECISION\b', tokens.Name.Builtin),
+ (r'(?<=\.)[^\W\d_]\w*', tokens.Name),
+ (r'[^\W\d]\w*', is_keyword),
+ (r'[;:()\[\],\.]', tokens.Punctuation),
+ (r'[<>=~!]+', tokens.Operator.Comparison),
+ (r'[+/@#%^&|`?^-]+', tokens.Operator),
+ ],
+ 'multiline-comments': [
+ (r'/\*', tokens.Comment.Multiline, 'multiline-comments'),
+ (r'\*/', tokens.Comment.Multiline, '#pop'),
+ (r'[^/\*]+', tokens.Comment.Multiline),
+ (r'[/*]', tokens.Comment.Multiline),
+ ]}
+
KEYWORDS = {
'ABORT': tokens.Keyword,
'ABS': tokens.Keyword,
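
With the rules now living in keywords.py, the keyword lookup (is_keyword) can
be exercised without constructing a lexer. A rough sketch of the intended
lookup behavior; the SELECT mapping assumes KEYWORDS_COMMON contains
'SELECT': tokens.Keyword.DML, which is not shown in this hunk:

    from sqlparse import tokens
    from sqlparse.keywords import is_keyword

    # KEYWORDS_COMMON takes precedence over KEYWORDS; anything else
    # falls back to tokens.Name. The original casing is preserved.
    assert is_keyword('select') == (tokens.Keyword.DML, 'select')  # assumed mapping
    assert is_keyword('abort') == (tokens.Keyword, 'abort')        # per KEYWORDS above
    assert is_keyword('my_column') == (tokens.Name, 'my_column')
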
diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py
index 6bc49ee..bb7fb48 100644
--- a/sqlparse/lexer.py
+++ b/sqlparse/lexer.py
@@ -13,245 +13,34 @@
# and to allow some customizations.
import re
-import sys
from sqlparse import tokens
-from sqlparse.keywords import KEYWORDS, KEYWORDS_COMMON
-from sqlparse.compat import StringIO, string_types, with_metaclass, text_type
+from sqlparse.keywords import SQL_REGEX
+from sqlparse.compat import StringIO, string_types, text_type
+from sqlparse.utils import consume
-class include(str):
- pass
-
-
-class combined(tuple):
- """Indicates a state combined from multiple states."""
-
- def __new__(cls, *args):
- return tuple.__new__(cls, args)
-
- def __init__(self, *args):
- # tuple.__init__ doesn't do anything
- pass
-
-
-def is_keyword(value):
- test = value.upper()
- return KEYWORDS_COMMON.get(test, KEYWORDS.get(test, tokens.Name)), value
-
-
-def apply_filters(stream, filters, lexer=None):
- """
- Use this method to apply an iterable of filters to
- a stream. If lexer is given it's forwarded to the
- filter, otherwise the filter receives `None`.
- """
-
- def _apply(filter_, stream):
- for token in filter_.filter(lexer, stream):
- yield token
-
- for filter_ in filters:
- stream = _apply(filter_, stream)
- return stream
-
-
-class LexerMeta(type):
- """
- Metaclass for Lexer, creates the self._tokens attribute from
- self.tokens on the first instantiation.
- """
-
- def _process_state(cls, unprocessed, processed, state):
- assert type(state) is str, "wrong state name %r" % state
- assert state[0] != '#', "invalid state name %r" % state
- if state in processed:
- return processed[state]
- tokenlist = processed[state] = []
- rflags = cls.flags
- for tdef in unprocessed[state]:
- if isinstance(tdef, include):
- # it's a state reference
- assert tdef != state, "circular state reference %r" % state
- tokenlist.extend(cls._process_state(
- unprocessed, processed, str(tdef)))
- continue
-
- assert type(tdef) is tuple, "wrong rule def %r" % tdef
-
- try:
- rex = re.compile(tdef[0], rflags).match
- except Exception as err:
- raise ValueError(("uncompilable regex %r in state"
- " %r of %r: %s"
- % (tdef[0], state, cls, err)))
-
- assert type(tdef[1]) is tokens._TokenType or callable(tdef[1]), \
- ('token type must be simple type or callable, not %r'
- % (tdef[1],))
-
- if len(tdef) == 2:
- new_state = None
- else:
- tdef2 = tdef[2]
- if isinstance(tdef2, str):
- # an existing state
- if tdef2 == '#pop':
- new_state = -1
- elif tdef2 in unprocessed:
- new_state = (tdef2,)
- elif tdef2 == '#push':
- new_state = tdef2
- elif tdef2[:5] == '#pop:':
- new_state = -int(tdef2[5:])
- else:
- assert False, 'unknown new state %r' % tdef2
- elif isinstance(tdef2, combined):
- # combine a new state from existing ones
- new_state = '_tmp_%d' % cls._tmpname
- cls._tmpname += 1
- itokens = []
- for istate in tdef2:
- assert istate != state, \
- 'circular state ref %r' % istate
- itokens.extend(cls._process_state(unprocessed,
- processed, istate))
- processed[new_state] = itokens
- new_state = (new_state,)
- elif isinstance(tdef2, tuple):
- # push more than one state
- for state in tdef2:
- assert (state in unprocessed or
- state in ('#pop', '#push')), \
- 'unknown new state ' + state
- new_state = tdef2
- else:
- assert False, 'unknown new state def %r' % tdef2
- tokenlist.append((rex, tdef[1], new_state))
- return tokenlist
-
- def process_tokendef(cls):
- cls._all_tokens = {}
- cls._tmpname = 0
- processed = cls._all_tokens[cls.__name__] = {}
- # tokendefs = tokendefs or cls.tokens[name]
- for state in cls.tokens.keys():
- cls._process_state(cls.tokens, processed, state)
- return processed
-
- def __call__(cls, *args, **kwds):
- if not hasattr(cls, '_tokens'):
- cls._all_tokens = {}
- cls._tmpname = 0
- if hasattr(cls, 'token_variants') and cls.token_variants:
- # don't process yet
- pass
- else:
- cls._tokens = cls.process_tokendef()
-
- return type.__call__(cls, *args, **kwds)
-
-
-class _Lexer(object):
-
- encoding = 'utf-8'
- stripall = False
- stripnl = False
- tabsize = 0
+class Lexer(object):
flags = re.IGNORECASE | re.UNICODE
- tokens = {
- 'root': [
- (r'(--|# ).*?(\r\n|\r|\n)', tokens.Comment.Single),
- # $ matches *before* newline, therefore we have two patterns
- # to match Comment.Single
- (r'(--|# ).*?$', tokens.Comment.Single),
- (r'(\r\n|\r|\n)', tokens.Newline),
- (r'\s+', tokens.Whitespace),
- (r'/\*', tokens.Comment.Multiline, 'multiline-comments'),
- (r':=', tokens.Assignment),
- (r'::', tokens.Punctuation),
- (r'[*]', tokens.Wildcard),
- (r'CASE\b', tokens.Keyword), # extended CASE(foo)
- (r"`(``|[^`])*`", tokens.Name),
- (r"´(´´|[^´])*´", tokens.Name),
- (r'\$([^\W\d]\w*)?\$', tokens.Name.Builtin),
- (r'\?{1}', tokens.Name.Placeholder),
- (r'%\(\w+\)s', tokens.Name.Placeholder),
- (r'%s', tokens.Name.Placeholder),
- (r'[$:?]\w+', tokens.Name.Placeholder),
- # FIXME(andi): VALUES shouldn't be listed here
- # see https://github.com/andialbrecht/sqlparse/pull/64
- (r'VALUES', tokens.Keyword),
- (r'(@|##|#)[^\W\d_]\w+', tokens.Name),
- # IN is special, it may be followed by a parenthesis, but
- # is never a function, see issue183
- (r'in\b(?=[ (])?', tokens.Keyword),
- (r'USING(?=\()', tokens.Keyword),
- (r'[^\W\d_]\w*(?=[.(])', tokens.Name), # see issue39
- (r'[-]?0x[0-9a-fA-F]+', tokens.Number.Hexadecimal),
- (r'[-]?[0-9]*(\.[0-9]+)?[eE][-]?[0-9]+', tokens.Number.Float),
- (r'[-]?[0-9]*\.[0-9]+', tokens.Number.Float),
- (r'[-]?[0-9]+', tokens.Number.Integer),
- (r"'(''|\\\\|\\'|[^'])*'", tokens.String.Single),
- # not a real string literal in ANSI SQL:
- (r'(""|".*?[^\\]")', tokens.String.Symbol),
- # sqlite names can be escaped with [square brackets]. left bracket
- # cannot be preceded by word character or a right bracket --
- # otherwise it's probably an array index
- (r'(?<![\w\])])(\[[^\]]+\])', tokens.Name),
- (r'((LEFT\s+|RIGHT\s+|FULL\s+)?(INNER\s+|OUTER\s+|STRAIGHT\s+)?'
- r'|(CROSS\s+|NATURAL\s+)?)?JOIN\b', tokens.Keyword),
- (r'END(\s+IF|\s+LOOP|\s+WHILE)?\b', tokens.Keyword),
- (r'NOT NULL\b', tokens.Keyword),
- (r'CREATE(\s+OR\s+REPLACE)?\b', tokens.Keyword.DDL),
- (r'DOUBLE\s+PRECISION\b', tokens.Name.Builtin),
- (r'(?<=\.)[^\W\d_]\w*', tokens.Name),
- (r'[^\W\d]\w*', is_keyword),
- (r'[;:()\[\],\.]', tokens.Punctuation),
- (r'[<>=~!]+', tokens.Operator.Comparison),
- (r'[+/@#%^&|`?^-]+', tokens.Operator),
- ],
- 'multiline-comments': [
- (r'/\*', tokens.Comment.Multiline, 'multiline-comments'),
- (r'\*/', tokens.Comment.Multiline, '#pop'),
- (r'[^/\*]+', tokens.Comment.Multiline),
- (r'[/*]', tokens.Comment.Multiline),
- ]}
-
def __init__(self):
- self.filters = []
-
- def add_filter(self, filter_, **options):
- from sqlparse.filters import Filter
- if not isinstance(filter_, Filter):
- filter_ = filter_(**options)
- self.filters.append(filter_)
+ self._tokens = {}
- def _expandtabs(self, text):
- if self.tabsize > 0:
- text = text.expandtabs(self.tabsize)
- return text
+ for state in SQL_REGEX:
+ self._tokens[state] = []
- def _decode(self, text):
- if sys.version_info[0] == 3:
- if isinstance(text, str):
- return self._expandtabs(text)
- if self.encoding == 'guess':
- try:
- text = text.decode('utf-8')
- if text.startswith(u'\ufeff'):
- text = text[len(u'\ufeff'):]
- except UnicodeDecodeError:
- text = text.decode('latin1')
- else:
- try:
- text = text.decode(self.encoding)
- except UnicodeDecodeError:
- text = text.decode('unicode-escape')
- return self._expandtabs(text)
+ for tdef in SQL_REGEX[state]:
+ rex = re.compile(tdef[0], self.flags).match
+ new_state = None
+ if len(tdef) > 2:
+ # only the multiline-comment rules define a state change
+ if tdef[2] == '#pop':
+ new_state = -1
+ elif tdef[2] in SQL_REGEX:
+ new_state = (tdef[2],)
+ self._tokens[state].append((rex, tdef[1], new_state))
- def get_tokens(self, text, unfiltered=False):
+ def get_tokens(self, text, encoding=None):
"""
Return an iterable of (tokentype, value) pairs generated from
`text`. If `unfiltered` is set to `True`, the filtering mechanism
@@ -259,101 +48,51 @@ class _Lexer(object):
Also preprocess the text, i.e. expand tabs and strip it if
wanted and applies registered filters.
- """
- if isinstance(text, string_types):
- if self.stripall:
- text = text.strip()
- elif self.stripnl:
- text = text.strip('\n')
-
- if sys.version_info[0] < 3 and isinstance(text, text_type):
- text = StringIO(text.encode('utf-8'))
- self.encoding = 'utf-8'
- else:
- text = StringIO(text)
-
- def streamer():
- for i, t, v in self.get_tokens_unprocessed(text):
- yield t, v
- stream = streamer()
- if not unfiltered:
- stream = apply_filters(stream, self.filters, self)
- return stream
- def get_tokens_unprocessed(self, stream, stack=('root',)):
- """
Split ``text`` into (tokentype, text) pairs.
``stack`` is the initial stack (default: ``['root']``)
"""
- pos = 0
- tokendefs = self._tokens # see __call__, pylint:disable=E1101
- statestack = list(stack)
- statetokens = tokendefs[statestack[-1]]
- known_names = {}
+ encoding = encoding or 'utf-8'
+ statestack = ['root', ]
+ statetokens = self._tokens['root']
- text = stream.read()
- text = self._decode(text)
+ if isinstance(text, string_types):
+ text = StringIO(text)
- while 1:
+ text = text.read()
+ if not isinstance(text, text_type):
+ try:
+ text = text.decode(encoding)
+ except UnicodeDecodeError:
+ text = text.decode('unicode-escape')
+
+ iterable = enumerate(text)
+ for pos, char in iterable:
for rexmatch, action, new_state in statetokens:
m = rexmatch(text, pos)
- if m:
- value = m.group()
- if value in known_names:
- yield pos, known_names[value], value
- elif type(action) is tokens._TokenType:
- yield pos, action, value
- elif hasattr(action, '__call__'):
- ttype, value = action(value)
- known_names[value] = ttype
- yield pos, ttype, value
- else:
- for item in action(self, m):
- yield item
- pos = m.end()
- if new_state is not None:
- # state transition
- if isinstance(new_state, tuple):
- for state in new_state:
- if state == '#pop':
- statestack.pop()
- elif state == '#push':
- statestack.append(statestack[-1])
- elif (
- # Ugly hack - multiline-comments
- # are not stackable
- state != 'multiline-comments'
- or not statestack
- or statestack[-1] != 'multiline-comments'
- ):
- statestack.append(state)
- elif isinstance(new_state, int):
- # pop
- del statestack[new_state:]
- elif new_state == '#push':
- statestack.append(statestack[-1])
- else:
- assert False, "wrong state def: %r" % new_state
- statetokens = tokendefs[statestack[-1]]
- break
- else:
- try:
- if text[pos] == '\n':
- # at EOL, reset state to "root"
- pos += 1
- statestack = ['root']
- statetokens = tokendefs['root']
- yield pos, tokens.Text, u'\n'
- continue
- yield pos, tokens.Error, text[pos]
- pos += 1
- except IndexError:
- break
-
-class Lexer(with_metaclass(LexerMeta, _Lexer)):
- pass
+ if not m:
+ continue
+ elif isinstance(action, tokens._TokenType):
+ yield action, m.group()
+ elif callable(action):
+ yield action(m.group())
+
+ if isinstance(new_state, tuple):
+ for state in new_state:
+ # fixme: multiline-comments not stackable
+ if not (state == 'multiline-comments'
+ and statestack[-1] == 'multiline-comments'):
+ statestack.append(state)
+ elif isinstance(new_state, int):
+ del statestack[new_state:]
+ statetokens = self._tokens[statestack[-1]]
+
+ consume(iterable, m.end() - pos - 1)
+ break
+ else:
+ yield tokens.Error, char
def tokenize(sql, encoding=None):
@@ -362,7 +101,4 @@ def tokenize(sql, encoding=None):
Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream
of ``(token type, value)`` items.
"""
- lexer = Lexer()
- if encoding is not None:
- lexer.encoding = encoding
- return lexer.get_tokens(sql)
+ return Lexer().get_tokens(sql, encoding)
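
For callers nothing changes at the surface: tokenize() still yields a stream
of (tokentype, value) pairs, it just no longer builds filter chains along the
way. A short usage sketch; the commented output is indicative, not verbatim:

    from sqlparse.lexer import tokenize

    for ttype, value in tokenize('SELECT * FROM foo;'):
        print(ttype, repr(value))
    # Token.Keyword.DML 'SELECT', Token.Text.Whitespace ' ',
    # Token.Wildcard '*', ...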