Diffstat (limited to 'sqlparse')
-rw-r--r--   sqlparse/compat.py   |  18
-rw-r--r--   sqlparse/keywords.py |  67
-rw-r--r--   sqlparse/lexer.py    | 372
3 files changed, 122 insertions, 335 deletions
diff --git a/sqlparse/compat.py b/sqlparse/compat.py
index 334883b..c1aacf6 100644
--- a/sqlparse/compat.py
+++ b/sqlparse/compat.py
@@ -14,11 +14,10 @@
 PY2 = sys.version_info[0] == 2
 PY3 = sys.version_info[0] == 3

 if PY3:
-    def u(s):
+    def u(s, encoding=None):
         return str(s)

-    range = range
     text_type = str
     string_types = (str,)
     from io import StringIO
@@ -33,21 +32,6 @@ elif PY2:
         return unicode(s, encoding)

-    range = xrange
     text_type = unicode
     string_types = (basestring,)
     from StringIO import StringIO
-
-
-# Directly copied from six:
-def with_metaclass(meta, *bases):
-    """Create a base class with a metaclass."""
-
-    # This requires a bit of explanation: the basic idea is to make a dummy
-    # metaclass for one level of class instantiation that replaces itself with
-    # the actual metaclass.
-    class metaclass(meta):
-        def __new__(cls, name, this_bases, d):
-            return meta(name, bases, d)
-
-    return type.__new__(metaclass, 'temporary_class', (), {})
diff --git a/sqlparse/keywords.py b/sqlparse/keywords.py
index bfea5d1..9cda48a 100644
--- a/sqlparse/keywords.py
+++ b/sqlparse/keywords.py
@@ -1,5 +1,72 @@
+# -*- coding: utf-8 -*-
+
 from sqlparse import tokens

+
+def is_keyword(value):
+    val = value.upper()
+    return (KEYWORDS_COMMON.get(val) or KEYWORDS.get(val, tokens.Name)), value
+
+
+SQL_REGEX = {
+    'root': [
+        (r'(--|# ).*?(\r\n|\r|\n)', tokens.Comment.Single),
+        # $ matches *before* newline, therefore we have two patterns
+        # to match Comment.Single
+        (r'(--|# ).*?$', tokens.Comment.Single),
+        (r'(\r\n|\r|\n)', tokens.Newline),
+        (r'\s+', tokens.Whitespace),
+        (r'/\*', tokens.Comment.Multiline, 'multiline-comments'),
+        (r':=', tokens.Assignment),
+        (r'::', tokens.Punctuation),
+        (r'[*]', tokens.Wildcard),
+        (r'CASE\b', tokens.Keyword),  # extended CASE(foo)
+        (r"`(``|[^`])*`", tokens.Name),
+        (r"´(´´|[^´])*´", tokens.Name),
+        (r'\$([^\W\d]\w*)?\$', tokens.Name.Builtin),
+        (r'\?{1}', tokens.Name.Placeholder),
+        (r'%\(\w+\)s', tokens.Name.Placeholder),
+        (r'%s', tokens.Name.Placeholder),
+        (r'[$:?]\w+', tokens.Name.Placeholder),
+        # FIXME(andi): VALUES shouldn't be listed here
+        # see https://github.com/andialbrecht/sqlparse/pull/64
+        (r'VALUES', tokens.Keyword),
+        (r'(@|##|#)[^\W\d_]\w+', tokens.Name),
+        # IN is special, it may be followed by a parenthesis, but
+        # is never a functino, see issue183
+        (r'in\b(?=[ (])?', tokens.Keyword),
+        (r'USING(?=\()', tokens.Keyword),
+        (r'[^\W\d_]\w*(?=[.(])', tokens.Name),  # see issue39
+        (r'[-]?0x[0-9a-fA-F]+', tokens.Number.Hexadecimal),
+        (r'[-]?[0-9]*(\.[0-9]+)?[eE][-]?[0-9]+', tokens.Number.Float),
+        (r'[-]?[0-9]*\.[0-9]+', tokens.Number.Float),
+        (r'[-]?[0-9]+', tokens.Number.Integer),
+        (r"'(''|\\\\|\\'|[^'])*'", tokens.String.Single),
+        # not a real string literal in ANSI SQL:
+        (r'(""|".*?[^\\]")', tokens.String.Symbol),
+        # sqlite names can be escaped with [square brackets]. left bracket
+        # cannot be preceded by word character or a right bracket --
+        # otherwise it's probably an array index
+        (r'(?<![\w\])])(\[[^\]]+\])', tokens.Name),
+        (r'((LEFT\s+|RIGHT\s+|FULL\s+)?(INNER\s+|OUTER\s+|STRAIGHT\s+)?'
+         r'|(CROSS\s+|NATURAL\s+)?)?JOIN\b', tokens.Keyword),
+        (r'END(\s+IF|\s+LOOP|\s+WHILE)?\b', tokens.Keyword),
+        (r'NOT NULL\b', tokens.Keyword),
+        (r'CREATE(\s+OR\s+REPLACE)?\b', tokens.Keyword.DDL),
+        (r'DOUBLE\s+PRECISION\b', tokens.Name.Builtin),
+        (r'(?<=\.)[^\W\d_]\w*', tokens.Name),
+        (r'[^\W\d]\w*', is_keyword),
+        (r'[;:()\[\],\.]', tokens.Punctuation),
+        (r'[<>=~!]+', tokens.Operator.Comparison),
+        (r'[+/@#%^&|`?^-]+', tokens.Operator),
+    ],
+    'multiline-comments': [
+        (r'/\*', tokens.Comment.Multiline, 'multiline-comments'),
+        (r'\*/', tokens.Comment.Multiline, '#pop'),
+        (r'[^/\*]+', tokens.Comment.Multiline),
+        (r'[/*]', tokens.Comment.Multiline),
+    ]}
+
 KEYWORDS = {
     'ABORT': tokens.Keyword,
     'ABS': tokens.Keyword,
diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py
index 6bc49ee..bb7fb48 100644
--- a/sqlparse/lexer.py
+++ b/sqlparse/lexer.py
@@ -13,245 +13,34 @@
 # and to allow some customizations.

 import re
-import sys

 from sqlparse import tokens
-from sqlparse.keywords import KEYWORDS, KEYWORDS_COMMON
-from sqlparse.compat import StringIO, string_types, with_metaclass, text_type
+from sqlparse.keywords import SQL_REGEX
+from sqlparse.compat import StringIO, string_types, text_type
+from sqlparse.utils import consume


-class include(str):
-    pass
-
-
-class combined(tuple):
-    """Indicates a state combined from multiple states."""
-
-    def __new__(cls, *args):
-        return tuple.__new__(cls, args)
-
-    def __init__(self, *args):
-        # tuple.__init__ doesn't do anything
-        pass
-
-
-def is_keyword(value):
-    test = value.upper()
-    return KEYWORDS_COMMON.get(test, KEYWORDS.get(test, tokens.Name)), value
-
-
-def apply_filters(stream, filters, lexer=None):
-    """
-    Use this method to apply an iterable of filters to
-    a stream. If lexer is given it's forwarded to the
-    filter, otherwise the filter receives `None`.
-    """
-
-    def _apply(filter_, stream):
-        for token in filter_.filter(lexer, stream):
-            yield token
-
-    for filter_ in filters:
-        stream = _apply(filter_, stream)
-    return stream
-
-
-class LexerMeta(type):
-    """
-    Metaclass for Lexer, creates the self._tokens attribute from
-    self.tokens on the first instantiation.
-    """
-
-    def _process_state(cls, unprocessed, processed, state):
-        assert type(state) is str, "wrong state name %r" % state
-        assert state[0] != '#', "invalid state name %r" % state
-        if state in processed:
-            return processed[state]
-        tokenlist = processed[state] = []
-        rflags = cls.flags
-        for tdef in unprocessed[state]:
-            if isinstance(tdef, include):
-                # it's a state reference
-                assert tdef != state, "circular state reference %r" % state
-                tokenlist.extend(cls._process_state(
-                    unprocessed, processed, str(tdef)))
-                continue
-
-            assert type(tdef) is tuple, "wrong rule def %r" % tdef
-
-            try:
-                rex = re.compile(tdef[0], rflags).match
-            except Exception as err:
-                raise ValueError(("uncompilable regex %r in state"
-                                  " %r of %r: %s"
-                                  % (tdef[0], state, cls, err)))
-
-            assert type(tdef[1]) is tokens._TokenType or callable(tdef[1]), \
-                ('token type must be simple type or callable, not %r'
-                 % (tdef[1],))
-
-            if len(tdef) == 2:
-                new_state = None
-            else:
-                tdef2 = tdef[2]
-                if isinstance(tdef2, str):
-                    # an existing state
-                    if tdef2 == '#pop':
-                        new_state = -1
-                    elif tdef2 in unprocessed:
-                        new_state = (tdef2,)
-                    elif tdef2 == '#push':
-                        new_state = tdef2
-                    elif tdef2[:5] == '#pop:':
-                        new_state = -int(tdef2[5:])
-                    else:
-                        assert False, 'unknown new state %r' % tdef2
-                elif isinstance(tdef2, combined):
-                    # combine a new state from existing ones
-                    new_state = '_tmp_%d' % cls._tmpname
-                    cls._tmpname += 1
-                    itokens = []
-                    for istate in tdef2:
-                        assert istate != state, \
-                            'circular state ref %r' % istate
-                        itokens.extend(cls._process_state(unprocessed,
-                                                          processed, istate))
-                    processed[new_state] = itokens
-                    new_state = (new_state,)
-                elif isinstance(tdef2, tuple):
-                    # push more than one state
-                    for state in tdef2:
-                        assert (state in unprocessed or
-                                state in ('#pop', '#push')), \
-                            'unknown new state ' + state
-                    new_state = tdef2
-                else:
-                    assert False, 'unknown new state def %r' % tdef2
-            tokenlist.append((rex, tdef[1], new_state))
-        return tokenlist
-
-    def process_tokendef(cls):
-        cls._all_tokens = {}
-        cls._tmpname = 0
-        processed = cls._all_tokens[cls.__name__] = {}
-        # tokendefs = tokendefs or cls.tokens[name]
-        for state in cls.tokens.keys():
-            cls._process_state(cls.tokens, processed, state)
-        return processed
-
-    def __call__(cls, *args, **kwds):
-        if not hasattr(cls, '_tokens'):
-            cls._all_tokens = {}
-            cls._tmpname = 0
-            if hasattr(cls, 'token_variants') and cls.token_variants:
-                # don't process yet
-                pass
-            else:
-                cls._tokens = cls.process_tokendef()
-
-        return type.__call__(cls, *args, **kwds)
-
-
-class _Lexer(object):
-
-    encoding = 'utf-8'
-    stripall = False
-    stripnl = False
-    tabsize = 0
+class Lexer(object):
     flags = re.IGNORECASE | re.UNICODE
-    tokens = {
-        'root': [
-            (r'(--|# ).*?(\r\n|\r|\n)', tokens.Comment.Single),
-            # $ matches *before* newline, therefore we have two patterns
-            # to match Comment.Single
-            (r'(--|# ).*?$', tokens.Comment.Single),
-            (r'(\r\n|\r|\n)', tokens.Newline),
-            (r'\s+', tokens.Whitespace),
-            (r'/\*', tokens.Comment.Multiline, 'multiline-comments'),
-            (r':=', tokens.Assignment),
-            (r'::', tokens.Punctuation),
-            (r'[*]', tokens.Wildcard),
-            (r'CASE\b', tokens.Keyword),  # extended CASE(foo)
-            (r"`(``|[^`])*`", tokens.Name),
-            (r"´(´´|[^´])*´", tokens.Name),
-            (r'\$([^\W\d]\w*)?\$', tokens.Name.Builtin),
-            (r'\?{1}', tokens.Name.Placeholder),
-            (r'%\(\w+\)s', tokens.Name.Placeholder),
-            (r'%s', tokens.Name.Placeholder),
-            (r'[$:?]\w+', tokens.Name.Placeholder),
-            # FIXME(andi): VALUES shouldn't be listed here
-            # see https://github.com/andialbrecht/sqlparse/pull/64
-            (r'VALUES', tokens.Keyword),
-            (r'(@|##|#)[^\W\d_]\w+', tokens.Name),
-            # IN is special, it may be followed by a parenthesis, but
-            # is never a functino, see issue183
-            (r'in\b(?=[ (])?', tokens.Keyword),
-            (r'USING(?=\()', tokens.Keyword),
-            (r'[^\W\d_]\w*(?=[.(])', tokens.Name),  # see issue39
-            (r'[-]?0x[0-9a-fA-F]+', tokens.Number.Hexadecimal),
-            (r'[-]?[0-9]*(\.[0-9]+)?[eE][-]?[0-9]+', tokens.Number.Float),
-            (r'[-]?[0-9]*\.[0-9]+', tokens.Number.Float),
-            (r'[-]?[0-9]+', tokens.Number.Integer),
-            (r"'(''|\\\\|\\'|[^'])*'", tokens.String.Single),
-            # not a real string literal in ANSI SQL:
-            (r'(""|".*?[^\\]")', tokens.String.Symbol),
-            # sqlite names can be escaped with [square brackets]. left bracket
-            # cannot be preceded by word character or a right bracket --
-            # otherwise it's probably an array index
-            (r'(?<![\w\])])(\[[^\]]+\])', tokens.Name),
-            (r'((LEFT\s+|RIGHT\s+|FULL\s+)?(INNER\s+|OUTER\s+|STRAIGHT\s+)?'
-             r'|(CROSS\s+|NATURAL\s+)?)?JOIN\b', tokens.Keyword),
-            (r'END(\s+IF|\s+LOOP|\s+WHILE)?\b', tokens.Keyword),
-            (r'NOT NULL\b', tokens.Keyword),
-            (r'CREATE(\s+OR\s+REPLACE)?\b', tokens.Keyword.DDL),
-            (r'DOUBLE\s+PRECISION\b', tokens.Name.Builtin),
-            (r'(?<=\.)[^\W\d_]\w*', tokens.Name),
-            (r'[^\W\d]\w*', is_keyword),
-            (r'[;:()\[\],\.]', tokens.Punctuation),
-            (r'[<>=~!]+', tokens.Operator.Comparison),
-            (r'[+/@#%^&|`?^-]+', tokens.Operator),
-        ],
-        'multiline-comments': [
-            (r'/\*', tokens.Comment.Multiline, 'multiline-comments'),
-            (r'\*/', tokens.Comment.Multiline, '#pop'),
-            (r'[^/\*]+', tokens.Comment.Multiline),
-            (r'[/*]', tokens.Comment.Multiline),
-        ]}

     def __init__(self):
-        self.filters = []
-
-    def add_filter(self, filter_, **options):
-        from sqlparse.filters import Filter
-        if not isinstance(filter_, Filter):
-            filter_ = filter_(**options)
-        self.filters.append(filter_)
+        self._tokens = {}

-    def _expandtabs(self, text):
-        if self.tabsize > 0:
-            text = text.expandtabs(self.tabsize)
-        return text
+        for state in SQL_REGEX:
+            self._tokens[state] = []

-    def _decode(self, text):
-        if sys.version_info[0] == 3:
-            if isinstance(text, str):
-                return self._expandtabs(text)
-        if self.encoding == 'guess':
-            try:
-                text = text.decode('utf-8')
-                if text.startswith(u'\ufeff'):
-                    text = text[len(u'\ufeff'):]
-            except UnicodeDecodeError:
-                text = text.decode('latin1')
-        else:
-            try:
-                text = text.decode(self.encoding)
-            except UnicodeDecodeError:
-                text = text.decode('unicode-escape')
-        return self._expandtabs(text)
+            for tdef in SQL_REGEX[state]:
+                rex = re.compile(tdef[0], self.flags).match
+                new_state = None
+                if len(tdef) > 2:
+                    # Only Multiline comments
+                    if tdef[2] == '#pop':
+                        new_state = -1
+                    elif tdef[2] in SQL_REGEX:
+                        new_state = (tdef[2],)
+                self._tokens[state].append((rex, tdef[1], new_state))

-    def get_tokens(self, text, unfiltered=False):
+    def get_tokens(self, text, encoding=None):
         """
         Return an iterable of (tokentype, value) pairs generated from
         `text`. If `unfiltered` is set to `True`, the filtering mechanism
@@ -259,101 +48,51 @@ class _Lexer(object):

         Also preprocess the text, i.e. expand tabs and strip it
         if wanted and applies registered filters.
-        """
-        if isinstance(text, string_types):
-            if self.stripall:
-                text = text.strip()
-            elif self.stripnl:
-                text = text.strip('\n')
-
-        if sys.version_info[0] < 3 and isinstance(text, text_type):
-            text = StringIO(text.encode('utf-8'))
-            self.encoding = 'utf-8'
-        else:
-            text = StringIO(text)
-
-        def streamer():
-            for i, t, v in self.get_tokens_unprocessed(text):
-                yield t, v
-        stream = streamer()
-        if not unfiltered:
-            stream = apply_filters(stream, self.filters, self)
-        return stream

-    def get_tokens_unprocessed(self, stream, stack=('root',)):
-        """
         Split ``text`` into (tokentype, text) pairs.

         ``stack`` is the inital stack (default: ``['root']``)
         """
-        pos = 0
-        tokendefs = self._tokens  # see __call__, pylint:disable=E1101
-        statestack = list(stack)
-        statetokens = tokendefs[statestack[-1]]
-        known_names = {}
+        encoding = encoding or 'utf-8'
+        statestack = ['root', ]
+        statetokens = self._tokens['root']

-        text = stream.read()
-        text = self._decode(text)
+        if isinstance(text, string_types):
+            text = StringIO(text)

-        while 1:
+        text = text.read()
+        if not isinstance(text, text_type):
+            try:
+                text = text.decode(encoding)
+            except UnicodeDecodeError:
+                text = text.decode('unicode-escape')
+
+        iterable = enumerate(text)
+        for pos, char in iterable:
             for rexmatch, action, new_state in statetokens:
                 m = rexmatch(text, pos)
-                if m:
-                    value = m.group()
-                    if value in known_names:
-                        yield pos, known_names[value], value
-                    elif type(action) is tokens._TokenType:
-                        yield pos, action, value
-                    elif hasattr(action, '__call__'):
-                        ttype, value = action(value)
-                        known_names[value] = ttype
-                        yield pos, ttype, value
-                    else:
-                        for item in action(self, m):
-                            yield item
-                    pos = m.end()
-                    if new_state is not None:
-                        # state transition
-                        if isinstance(new_state, tuple):
-                            for state in new_state:
-                                if state == '#pop':
-                                    statestack.pop()
-                                elif state == '#push':
-                                    statestack.append(statestack[-1])
-                                elif (
-                                    # Ugly hack - multiline-comments
-                                    # are not stackable
-                                    state != 'multiline-comments'
-                                    or not statestack
-                                    or statestack[-1] != 'multiline-comments'
-                                ):
-                                    statestack.append(state)
-                        elif isinstance(new_state, int):
-                            # pop
-                            del statestack[new_state:]
-                        elif new_state == '#push':
-                            statestack.append(statestack[-1])
-                        else:
-                            assert False, "wrong state def: %r" % new_state
-                        statetokens = tokendefs[statestack[-1]]
-                    break
-            else:
-                try:
-                    if text[pos] == '\n':
-                        # at EOL, reset state to "root"
-                        pos += 1
-                        statestack = ['root']
-                        statetokens = tokendefs['root']
-                        yield pos, tokens.Text, u'\n'
-                        continue
-                    yield pos, tokens.Error, text[pos]
-                    pos += 1
-                except IndexError:
-                    break
-
-class Lexer(with_metaclass(LexerMeta, _Lexer)):
-    pass
+                if not m:
+                    continue
+                elif isinstance(action, tokens._TokenType):
+                    yield action, m.group()
+                elif callable(action):
+                    yield action(m.group())
+
+                if isinstance(new_state, tuple):
+                    for state in new_state:
+                        # fixme: multiline-comments not stackable
+                        if not (state == 'multiline-comments'
+                                and statestack[-1] == 'multiline-comments'):
+                            statestack.append(state)
+                elif isinstance(new_state, int):
+                    del statestack[new_state:]
+                statetokens = self._tokens[statestack[-1]]
+
+                consume(iterable, m.end() - pos - 1)
+                break
+            else:
+                yield tokens.Error, char


 def tokenize(sql, encoding=None):
@@ -362,7 +101,4 @@ def tokenize(sql, encoding=None):
     Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream
     of ``(token type, value)`` items.
     """
-    lexer = Lexer()
-    if encoding is not None:
-        lexer.encoding = encoding
-    return lexer.get_tokens(sql)
+    return Lexer().get_tokens(sql, encoding)