| field | value | date |
|---|---|---|
| author | Andi Albrecht <albrecht.andi@gmail.com> | 2009-04-03 21:26:42 +0200 |
| committer | Andi Albrecht <albrecht.andi@gmail.com> | 2009-04-03 21:26:42 +0200 |
| commit | 361122eb22d5681c58dac731009e4814b3dd5fa5 (patch) | |
| tree | b096496bc9c6b8febe092d0aefd56de1a4f8f4a0 /sqlparse | |
| download | sqlparse-361122eb22d5681c58dac731009e4814b3dd5fa5.tar.gz | |
Initial import.
Diffstat (limited to 'sqlparse')
| mode | file | insertions |
|---|---|---|
| -rw-r--r-- | sqlparse/__init__.py | 65 |
| -rw-r--r-- | sqlparse/dialects.py | 88 |
| -rw-r--r-- | sqlparse/engine/__init__.py | 81 |
| -rw-r--r-- | sqlparse/engine/_grouping.py | 499 |
| -rw-r--r-- | sqlparse/engine/filter.py | 98 |
| -rw-r--r-- | sqlparse/engine/grouping.py | 537 |
| -rw-r--r-- | sqlparse/filters.py | 432 |
| -rw-r--r-- | sqlparse/formatter.py | 163 |
| -rw-r--r-- | sqlparse/keywords.py | 589 |
| -rw-r--r-- | sqlparse/lexer.py | 310 |
| -rw-r--r-- | sqlparse/tokens.py | 131 |
11 files changed, 2993 insertions, 0 deletions
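
The diffs below make up the whole package. Before diving in, here is a minimal usage sketch of the three public entry points defined in sqlparse/__init__.py below (parse, format, split); the sample SQL string is illustrative, and the code assumes the Python 2 runtime this commit targets:

```python
# Usage sketch for the 0.1.0 API introduced in sqlparse/__init__.py below.
# Python 2 era code (split() returns unicode objects); the SQL is illustrative.
import sqlparse

sql = "select foo from bar where baz = 1; select * from foo;"

# split(): statement splitting only, returns a list of strings.
for statement in sqlparse.split(sql):
    print statement

# parse(): full analysis, returns a tuple of grouped Statement instances.
stmt = sqlparse.parse(sql)[0]
print stmt.get_type()  # 'SELECT', assuming the lexer tags SELECT as Keyword.DML

# format(): validates its options (see formatter.validate_options below)
# and serializes the filtered statements back into a single string.
print sqlparse.format(sql, reindent=True, keyword_case='upper')
```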
diff --git a/sqlparse/__init__.py b/sqlparse/__init__.py new file mode 100644 index 0000000..01b3bd8 --- /dev/null +++ b/sqlparse/__init__.py @@ -0,0 +1,65 @@ +# Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com +# +# This module is part of python-sqlparse and is released under +# the BSD License: http://www.opensource.org/licenses/bsd-license.php. + +"""Parse SQL statements.""" + +__version__ = '0.1.0' + +import logging +import os + + +if 'SQLPARSE_DEBUG' in os.environ: + logging.basicConfig(level=logging.DEBUG) + + +class SQLParseError(Exception): + """Base class for exceptions in this module.""" + + +# Setup namespace +from sqlparse import engine +from sqlparse import filters +from sqlparse import formatter + + +def parse(sql): + """Parse sql and return a tuple of statements. + + *sql* is a single string containing one or more SQL statements. + + The returned :class:`~sqlparse.engine.grouping.Statement` instances are fully analyzed. + + Returns a tuple of :class:`~sqlparse.engine.grouping.Statement` instances. + """ + stack = engine.FilterStack() + stack.full_analyze() + return tuple(stack.run(sql)) + + +def format(sql, **options): + """Format *sql* according to *options*. + + The statements are parsed as in :meth:`parse`, formatted according to *options*, and returned as a single string. + + Available options are documented in the :mod:`~sqlparse.formatter` module. + """ + stack = engine.FilterStack() + options = formatter.validate_options(options) + stack = formatter.build_filter_stack(stack, options) + stack.postprocess.append(filters.SerializerUnicode()) + return ''.join(stack.run(sql)) + + +def split(sql): + """Split *sql* into separate statements. + + Returns a list of strings. + """ + stack = engine.FilterStack() + stack.split_statements = True + return [unicode(stmt) for stmt in stack.run(sql)] + diff --git a/sqlparse/dialects.py b/sqlparse/dialects.py new file mode 100644 index 0000000..cabe503 --- /dev/null +++ b/sqlparse/dialects.py @@ -0,0 +1,88 @@ +# Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com +# +# This module is part of python-sqlparse and is released under +# the BSD License: http://www.opensource.org/licenses/bsd-license.php. + +"""This module contains classes that represent SQL dialects.""" + +from tokens import * + + +class Dialect(object): + """Base class for SQL dialect implementations.""" + + def handle_token(self, tokentype, text): + """Handle a token. + + Arguments: + tokentype: A token type. + text: Text representation of the token. + + Returns: + A tuple of three items: tokentype, text, splitlevel. + splitlevel is either -1, 0 or 1 and describes an indentation level.
+ """ + raise NotImplementedError + + def reset(self): + """Reset Dialect state.""" + pass + + +class DefaultDialect(Dialect): + + def __init__(self): + self._in_declare = False + self._stmt_type = None + + def get_statement_type(self): + return self._stmt_type + + def set_statement_type(self, type_): + self._stmt_type = type_ + + def handle_token(self, tokentype, text): + if not tokentype == Keyword: + return tokentype, text, 0 + unified = text.upper() + if unified == 'DECLARE': + self._in_declare = True + return tokentype, text, 1 + if unified == 'BEGIN': + if self._in_declare: + return tokentype, text, 0 + return tokentype, text, 0 + if unified == 'END': + return tokentype, text, -1 + # TODO: Use a constant here + if unified in ('IF', 'FOR') and self._stmt_type == 6: + return tokentype, text, 1 + return tokentype, text, 0 + + def reset(self): + self._in_declare = False + + +class PSQLDialect(DefaultDialect): + + def __init__(self): + super(PSQLDialect, self).__init__() + self._in_dbldollar = False + + def handle_token(self, tokentype, text): + if (tokentype == Name.Builtin + and text.startswith('$') and text.endswith('$')): + if self._in_dbldollar: + self._in_dbldollar = False + return tokentype, text, -1 + else: + self._in_dbldollar = True + return tokentype, text, 1 + elif self._in_dbldollar: + return tokentype, text, 0 + else: + return super(PSQLDialect, self).handle_token(tokentype, text) + + def reset(self): + self._dollar_started = False + self._in_dbldollar = False diff --git a/sqlparse/engine/__init__.py b/sqlparse/engine/__init__.py new file mode 100644 index 0000000..5cac528 --- /dev/null +++ b/sqlparse/engine/__init__.py @@ -0,0 +1,81 @@ +# Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com +# +# This module is part of python-sqlparse and is released under +# the BSD License: http://www.opensource.org/licenses/bsd-license.php. 
+ +"""filter""" + +import logging +import re + +from sqlparse import lexer, SQLParseError +from sqlparse.engine import grouping +from sqlparse.engine.filter import StatementFilter + +# XXX remove this when cleanup is complete +Filter = object + + +class FilterStack(object): + + def __init__(self): + self.preprocess = [] + self.stmtprocess = [] + self.postprocess = [] + self.split_statements = False + self._grouping = False + + def _flatten(self, stream): + for token in stream: + if token.is_group(): + for t in self._flatten(token.tokens): + yield t + else: + yield token + + def enable_grouping(self): + self._grouping = True + + def full_analyze(self): + self.enable_grouping() + + def run(self, sql): + stream = lexer.tokenize(sql) + # Process token stream + if self.preprocess: + for filter_ in self.preprocess: + stream = filter_.process(self, stream) + + if (self.stmtprocess or self.postprocess or self.split_statements + or self._grouping): + splitter = StatementFilter() + stream = splitter.process(self, stream) + + if self._grouping: + def _group(stream): + for stmt in stream: + grouping.group(stmt) + yield stmt + stream = _group(stream) + + if self.stmtprocess: + def _run(stream): + ret = [] + for stmt in stream: + for filter_ in self.stmtprocess: + filter_.process(self, stmt) + ret.append(stmt) + return ret + stream = _run(stream) + + if self.postprocess: + def _run(stream): + for stmt in stream: + stmt.tokens = list(self._flatten(stmt.tokens)) + for filter_ in self.postprocess: + stmt = filter_.process(self, stmt) + yield stmt + stream = _run(stream) + + return stream + diff --git a/sqlparse/engine/_grouping.py b/sqlparse/engine/_grouping.py new file mode 100644 index 0000000..512c590 --- /dev/null +++ b/sqlparse/engine/_grouping.py @@ -0,0 +1,499 @@ +# -*- coding: utf-8 -*- + +import re + +from sqlparse.engine.filter import TokenFilter +from sqlparse import tokens as T + +class _Base(object): + + __slots__ = ('to_unicode', 'to_str', '_get_repr_name') + + def __unicode__(self): + return 'Unkown _Base object' + + def __str__(self): + return unicode(self).encode('latin-1') + + def __repr__(self): + raw = unicode(self) + if len(raw) > 7: + short = raw[:6]+u'...' 
+ else: + short = raw + short = re.sub('\s+', ' ', short) + return '<%s \'%s\' at 0x%07x>' % (self._get_repr_name(), + short, id(self)) + + def _get_repr_name(self): + return self.__class__.__name__ + + def to_unicode(self): + return unicode(self) + + def to_str(self): + return str(self) + + +class Token(_Base): + + __slots__ = ('value', 'ttype') + + def __init__(self, ttype, value): + self.value = value + self.ttype = ttype + + def __unicode__(self): + return self.value + + def _get_repr_name(self): + return str(self.ttype).split('.')[-1] + + def match(self, ttype, values): + if self.ttype is not ttype: + return False + if isinstance(values, basestring): + values = [values] + if self.ttype is T.Keyword: + return self.value.upper() in [v.upper() for v in values] + else: + return self.value in values + + def is_group(self): + return False + + def is_whitespace(self): + return self.ttype and self.ttype is T.Whitespace + + +class _Group(Token): + + __slots__ = ('value', 'ttype', 'tokens') + + def __init__(self, tokens=None): + super(_Group, self).__init__(None, None) + if tokens is None: + tokens = [] + self._tokens = tokens + + def _set_tokens(self, tokens): + self._tokens = tokens + def _get_tokens(self): + if type(self._tokens) is not types.TupleType: + self._tokens = tuple(self._tokens) + return self._tokens + tokens = property(fget=_get_tokens, fset=_set_tokens) + + def _get_repr_name(self): + return self.__class__.__name__ + + def _pprint_tree(self, depth=0): + """Pretty-print the object tree.""" + indent = ' '*(depth*2) + for token in self.tokens: + print '%s%r' % (indent, token) + if token.is_group(): + token._pprint_tree(depth+1) + + def __unicode__(self): + return u''.join(unicode(t) for t in self.tokens) + + @property + def subgroups(self): + #return [x for x in self.tokens if isinstance(x, _Group)] + for item in self.tokens: + if item.is_group(): + yield item + + def is_group(self): + return True + + +class Statement(_Group): + __slots__ = ('value', 'ttype', '_tokens') + + +class Parenthesis(_Group): + __slots__ = ('value', 'ttype', '_tokens') + + +class Where(_Group): + __slots__ = ('value', 'ttype', '_tokens') + + +class CommentMulti(_Group): + __slots__ = ('value', 'ttype', '_tokens') + + +class Identifier(_Group): + __slots__ = ('value', 'ttype', '_tokens') + + +class TypeCast(_Group): + __slots__ = ('value', 'ttype', '_tokens') + + @property + def casted_object(self): + return self.tokens[0] + + @property + def casted_type(self): + return self.tokens[-1] + + +class Alias(_Group): + __slots__ = ('value', 'ttype', '_tokens') + + @property + def aliased_object(self): + return self.tokens[0] + + @property + def alias(self): + return self.tokens[-1] + + + + +# - Filter + +class StatementFilter(TokenFilter): + + def __init__(self): + self._in_declare = False + self._in_dbldollar = False + self._is_create = False + + def _reset(self): + self._in_declare = False + self._in_dbldollar = False + self._is_create = False + + def _change_splitlevel(self, ttype, value): + # PostgreSQL + if (ttype == T.Name.Builtin + and value.startswith('$') and value.endswith('$')): + if self._in_dbldollar: + self._in_dbldollar = False + return -1 + else: + self._in_dbldollar = True + return 1 + elif self._in_dbldollar: + return 0 + + # ANSI + if ttype is not T.Keyword: + return 0 + + unified = value.upper() + + if unified == 'DECLARE': + self._in_declare = True + return 1 + + if unified == 'BEGIN': + if self._in_declare: + return 0 + return 0 + + if unified == 'END': + return -1 + + if ttype is 
T.Keyword.DDL and unified.startswith('CREATE'): + self._is_create = True + + if unified in ('IF', 'FOR') and self._is_create: + return 1 + + # Default + return 0 + + def process(self, stack, stream): + splitlevel = 0 + stmt = None + consume_ws = False + stmt_tokens = [] + for ttype, value in stream: + # Before appending the token + if (consume_ws and ttype is not T.Whitespace + and ttype is not T.Comment.Single): + consume_ws = False + stmt.tokens = stmt_tokens + yield stmt + self._reset() + stmt = None + splitlevel = 0 + if stmt is None: + stmt = Statement() + stmt_tokens = [] + splitlevel += self._change_splitlevel(ttype, value) + # Append the token + stmt_tokens.append(Token(ttype, value)) + # After appending the token + if (not splitlevel and ttype is T.Punctuation + and value == ';'): + consume_ws = True + if stmt is not None: + stmt.tokens = stmt_tokens + yield stmt + + +class GroupFilter(object): + + def process(self, stream): + pass + + +class GroupParenthesis(GroupFilter): + """Group parenthesis groups.""" + + def _finish_group(self, group): + start = group[0] + end = group[-1] + tokens = list(self._process(group[1:-1])) + return [start]+tokens+[end] + + def _process(self, stream): + group = None + depth = 0 + for token in stream: + if token.is_group(): + token.tokens = self._process(token.tokens) + if token.match(T.Punctuation, '('): + if depth == 0: + group = [] + depth += 1 + if group is not None: + group.append(token) + if token.match(T.Punctuation, ')'): + depth -= 1 + if depth == 0: + yield Parenthesis(self._finish_group(group)) + group = None + continue + if group is None: + yield token + + def process(self, group): + if not isinstance(group, Parenthesis): + group.tokens = self._process(group.tokens) + + +class GroupWhere(GroupFilter): + + def _process(self, stream): + group = None + depth = 0 + for token in stream: + if token.is_group(): + token.tokens = self._process(token.tokens) + if token.match(T.Keyword, 'WHERE'): + if depth == 0: + group = [] + depth += 1 + # Process conditions here? E.g. "A =|!=|in|is|... B"... 
+ elif (token.ttype is T.Keyword + and token.value.upper() in ('ORDER', 'GROUP', + 'LIMIT', 'UNION')): + depth -= 1 + if depth == 0: + yield Where(group) + group = None + if depth < 0: + depth = 0 + if group is not None: + group.append(token) + else: + yield token + if group is not None: + yield Where(group) + + def process(self, group): + if not isinstance(group, Where): + group.tokens = self._process(group.tokens) + + +class GroupMultiComments(GroupFilter): + """Groups Comment.Multiline and adds trailing whitespace up to first lb.""" + + def _process(self, stream): + new_tokens = [] + grp = None + consume_ws = False + for token in stream: + if token.is_group(): + token.tokens = self._process(token.tokens) + if token.ttype is T.Comment.Multiline: + if grp is None: + grp = [] + consume_ws = True + grp.append(token) + elif consume_ws and token.ttype is not T.Whitespace: + yield CommentMulti(grp) + grp = None + consume_ws = False + yield token + elif consume_ws: + lines = token.value.splitlines(True) + grp.append(Token(T.Whitespace, lines[0])) + if lines[0].endswith('\n'): + yield CommentMulti(grp) + grp = None + consume_ws = False + if lines[1:]: + yield Token(T.Whitespace, ''.join(lines[1:])) + else: + yield token + + def process(self, group): + if not isinstance(group, CommentMulti): + group.tokens = self._process(group.tokens) + + +## class GroupIdentifier(GroupFilter): + +## def _process(self, stream): +## buff = [] +## expect_dot = False +## for token in stream: +## if token.is_group(): +## token.tokens = self._process(token.tokens) +## if (token.ttype is T.String.Symbol or token.ttype is T.Name +## and not expect_dot): +## buff.append(token) +## expect_dot = True +## elif expect_dot and token.match(T.Punctuation, '.'): +## buff.append(token) +## expect_dot = False +## else: +## if expect_dot == False: +## # something's wrong, it ends with a dot... 
+## while buff: +## yield buff.pop(0) +## expect_dot = False +## elif buff: +## idt = Identifier() +## idt.tokens = buff +## yield idt +## buff = [] +## yield token +## if buff and expect_dot: +## idt = Identifier() +## idt.tokens = buff +## yield idt +## buff = [] +## while buff: +## yield buff.pop(0) + +## def process(self, group): +## if not isinstance(group, Identifier): +## group.tokens = self._process(group.tokens) + + +class AddTypeCastFilter(GroupFilter): + + def _process(self, stream): + buff = [] + expect_colon = False + has_colons = False + for token in stream: + if token.is_group(): + token.tokens = self._process(token.tokens) + if ((isinstance(token, Parenthesis) + or isinstance(token, Identifier)) + and not expect_colon): + buff.append(token) + expect_colon = True + elif expect_colon and token.match(T.Punctuation, ':'): + buff.append(token) + has_colons = True + elif (expect_colon + and (token.ttype in T.Name + or isinstance(token, Identifier)) + ): + if not has_colons: + while buff: + yield buff.pop(0) + yield token + else: + buff.append(token) + grp = TypeCast() + grp.tokens = buff + buff = [] + yield grp + expect_colons = has_colons = False + else: + while buff: + yield buff.pop(0) + yield token + while buff: + yield buff.pop(0) + + def process(self, group): + if not isinstance(group, TypeCast): + group.tokens = self._process(group.tokens) + + +class AddAliasFilter(GroupFilter): + + def _process(self, stream): + buff = [] + search_alias = False + lazy = False + for token in stream: + if token.is_group(): + token.tokens = self._process(token.tokens) + if search_alias and (isinstance(token, Identifier) + or token.ttype in (T.Name, + T.String.Symbol) + or (lazy and not token.is_whitespace())): + buff.append(token) + search_alias = lazy = False + grp = Alias() + grp.tokens = buff + buff = [] + yield grp + elif (isinstance(token, (Identifier, TypeCast)) + or token.ttype in (T.Name, T.String.Symbol)): + buff.append(token) + search_alias = True + elif search_alias and (token.is_whitespace() + or token.match(T.Keyword, 'as')): + buff.append(token) + if token.match(T.Keyword, 'as'): + lazy = True + else: + while buff: + yield buff.pop(0) + yield token + search_alias = False + while buff: + yield buff.pop(0) + + def process(self, group): + if not isinstance(group, Alias): + group.tokens = self._process(group.tokens) + + +GROUP_FILTER = (GroupParenthesis(), + GroupMultiComments(), + GroupWhere(), + GroupIdentifier(), + AddTypeCastFilter(), + AddAliasFilter(), + ) + +import types +def group_tokens(group): + def _materialize(g): + if type(g.tokens) is not types.TupleType: + g.tokens = tuple(g.tokens) + for sg in g.subgroups: + _materialize(sg) + for groupfilter in GROUP_FILTER: + groupfilter.process(group) +# _materialize(group) +# group.tokens = tuple(group.tokens) +# for subgroup in group.subgroups: +# group_tokens(subgroup) diff --git a/sqlparse/engine/filter.py b/sqlparse/engine/filter.py new file mode 100644 index 0000000..146690c --- /dev/null +++ b/sqlparse/engine/filter.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- + +from sqlparse import tokens as T +from sqlparse.engine.grouping import Statement, Token + + +class TokenFilter(object): + + def __init__(self, **options): + self.options = options + + def process(self, stack, stream): + """Process token stream.""" + raise NotImplementedError + + +class StatementFilter(TokenFilter): + + def __init__(self): + self._in_declare = False + self._in_dbldollar = False + self._is_create = False + + def _reset(self): + self._in_declare = 
False + self._in_dbldollar = False + self._is_create = False + + def _change_splitlevel(self, ttype, value): + # PostgreSQL + if (ttype == T.Name.Builtin + and value.startswith('$') and value.endswith('$')): + if self._in_dbldollar: + self._in_dbldollar = False + return -1 + else: + self._in_dbldollar = True + return 1 + elif self._in_dbldollar: + return 0 + + # ANSI + if ttype is not T.Keyword: + return 0 + + unified = value.upper() + + if unified == 'DECLARE': + self._in_declare = True + return 1 + + if unified == 'BEGIN': + if self._in_declare: + return 0 + return 0 + + if unified == 'END': + # Should this respect a preceeding BEGIN? + # In CASE ... WHEN ... END this results in a split level -1. + return -1 + + if ttype is T.Keyword.DDL and unified.startswith('CREATE'): + self._is_create = True + + if unified in ('IF', 'FOR') and self._is_create: + return 1 + + # Default + return 0 + + def process(self, stack, stream): + splitlevel = 0 + stmt = None + consume_ws = False + stmt_tokens = [] + for ttype, value in stream: + # Before appending the token + if (consume_ws and ttype is not T.Whitespace + and ttype is not T.Comment.Single): + consume_ws = False + stmt.tokens = stmt_tokens + yield stmt + self._reset() + stmt = None + splitlevel = 0 + if stmt is None: + stmt = Statement() + stmt_tokens = [] + splitlevel += self._change_splitlevel(ttype, value) + # Append the token + stmt_tokens.append(Token(ttype, value)) + # After appending the token + if (splitlevel <= 0 and ttype is T.Punctuation + and value == ';'): + consume_ws = True + if stmt is not None: + stmt.tokens = stmt_tokens + yield stmt diff --git a/sqlparse/engine/grouping.py b/sqlparse/engine/grouping.py new file mode 100644 index 0000000..433f539 --- /dev/null +++ b/sqlparse/engine/grouping.py @@ -0,0 +1,537 @@ +# -*- coding: utf-8 -*- + +import itertools +import re +import types + +from sqlparse import tokens as T + + +class Token(object): + + __slots__ = ('value', 'ttype') + + def __init__(self, ttype, value): + self.value = value + self.ttype = ttype + + def __str__(self): + return unicode(self).encode('latin-1') + + def __repr__(self): + short = self._get_repr_value() + return '<%s \'%s\' at 0x%07x>' % (self._get_repr_name(), + short, id(self)) + + def __unicode__(self): + return self.value + + def to_unicode(self): + return unicode(self) + + def _get_repr_name(self): + return str(self.ttype).split('.')[-1] + + def _get_repr_value(self): + raw = unicode(self) + if len(raw) > 7: + short = raw[:6]+u'...' 
+ else: + short = raw + return re.sub('\s+', ' ', short) + + def match(self, ttype, values, regex=False): + if self.ttype is not ttype: + return False + if values is None: + return self.ttype is ttype + if isinstance(values, basestring): + values = [values] + if regex: + if self.ttype is T.Keyword: + values = [re.compile(v, re.IGNORECASE) for v in values] + else: + values = [re.compile(v) for v in values] + for pattern in values: + if pattern.search(self.value): + return True + return False + else: + if self.ttype is T.Keyword: + return self.value.upper() in [v.upper() for v in values] + else: + return self.value in values + + def is_group(self): + return False + + def is_whitespace(self): + return self.ttype and self.ttype in T.Whitespace + + +class TokenList(Token): + + __slots__ = ('value', 'ttype', 'tokens') + + def __init__(self, tokens=None): + if tokens is None: + tokens = [] + self.tokens = tokens + Token.__init__(self, None, None) + + def __unicode__(self): + return ''.join(unicode(x) for x in self.flatten()) + + def __str__(self): + return unicode(self).encode('latin-1') + + def _get_repr_name(self): + return self.__class__.__name__ + + def _pprint_tree(self, max_depth=None, depth=0): + """Pretty-print the object tree.""" + indent = ' '*(depth*2) + for token in self.tokens: + if token.is_group(): + pre = ' | ' + else: + pre = ' | ' + print '%s%s%s \'%s\'' % (indent, pre, token._get_repr_name(), + token._get_repr_value()) + if (token.is_group() and max_depth is not None + and depth < max_depth): + token._pprint_tree(max_depth, depth+1) + + def flatten(self): + for token in self.tokens: + if isinstance(token, TokenList): + for item in token.flatten(): + yield item + else: + yield token + + def is_group(self): + return True + + def get_sublists(self): + return [x for x in self.tokens if isinstance(x, TokenList)] + + def token_first(self, ignore_whitespace=True): + for token in self.tokens: + if ignore_whitespace and token.is_whitespace(): + continue + return token + return None + + def token_next_by_instance(self, idx, clss): + if type(clss) not in (types.ListType, types.TupleType): + clss = (clss,) + if type(clss) is not types.TupleType: + clss = tuple(clss) + for token in self.tokens[idx:]: + if isinstance(token, clss): + return token + return None + + def token_next_by_type(self, idx, ttypes): + if not isinstance(ttypes, (types.TupleType, types.ListType)): + ttypes = [ttypes] + for token in self.tokens[idx:]: + if token.ttype in ttypes: + return token + return None + + def token_next_match(self, idx, ttype, value, regex=False): + if type(idx) != types.IntType: + idx = self.token_index(idx) + for token in self.tokens[idx:]: + if token.match(ttype, value, regex): + return token + return None + + def token_not_matching(self, idx, funcs): + for token in self.tokens[idx:]: + passed = False + for func in funcs: + if func(token): + passed = True + break + if not passed: + return token + return None + + def token_prev(self, idx, skip_ws=True): + while idx != 0: + idx -= 1 + if self.tokens[idx].is_whitespace() and skip_ws: + continue + return self.tokens[idx] + + def token_next(self, idx, skip_ws=True): + while idx < len(self.tokens)-1: + idx += 1 + if self.tokens[idx].is_whitespace() and skip_ws: + continue + return self.tokens[idx] + + def token_index(self, token): + """Return list index of token.""" + return self.tokens.index(token) + + def tokens_between(self, start, end, exclude_end=False): + """Return all tokens between (and including) start and end.""" + if exclude_end: + offset = 
0 + else: + offset = 1 + return self.tokens[self.token_index(start):self.token_index(end)+offset] + + def group_tokens(self, grp_cls, tokens): + """Replace tokens by instance of grp_cls.""" + idx = self.token_index(tokens[0]) + for t in tokens: + self.tokens.remove(t) + grp = grp_cls(tokens) + self.tokens.insert(idx, grp) + return grp + + def insert_before(self, where, token): + self.tokens.insert(self.token_index(where), token) + + +class Statement(TokenList): + + __slots__ = ('value', 'ttype', 'tokens') + + def get_type(self): + first_token = self.token_first() + if first_token.ttype in (T.Keyword.DML, T.Keyword.DDL): + return first_token.value.upper() + else: + return 'UNKNOWN' + + +class Identifier(TokenList): + + __slots__ = ('value', 'ttype', 'tokens') + + def has_alias(self): + return self.get_alias() is not None + + def get_alias(self): + kw = self.token_next_match(0, T.Keyword, 'AS') + if kw is not None: + alias = self.token_next(self.token_index(kw)) + if alias is None: + return None + else: + next_ = self.token_next(0) + if next_ is None or not isinstance(next_, Identifier): + return None + alias = next_ + if isinstance(alias, Identifier): + return alias.get_name() + else: + return alias.to_unicode() + + def get_name(self): + alias = self.get_alias() + if alias is not None: + return alias + return self.get_real_name() + + def get_real_name(self): + return self.token_next_by_type(0, T.Name).value + + def get_typecast(self): + marker = self.token_next_match(0, T.Punctuation, '::') + if marker is None: + return None + next_ = self.token_next(self.token_index(marker), False) + if next_ is None: + return None + return next_.to_unicode() + + +class IdentifierList(TokenList): + + __slots__ = ('value', 'ttype', 'tokens') + + def get_identifiers(self): + return [x for x in self.tokens if isinstance(x, Identifier)] + + +class Parenthesis(TokenList): + __slots__ = ('value', 'ttype', 'tokens') + + +class Assignment(TokenList): + __slots__ = ('value', 'ttype', 'tokens') + +class If(TokenList): + __slots__ = ('value', 'ttype', 'tokens') + +class For(TokenList): + __slots__ = ('value', 'ttype', 'tokens') + +class Comparsion(TokenList): + __slots__ = ('value', 'ttype', 'tokens') + +class Comment(TokenList): + __slots__ = ('value', 'ttype', 'tokens') + +class Where(TokenList): + __slots__ = ('value', 'ttype', 'tokens') + + +class Case(TokenList): + + __slots__ = ('value', 'ttype', 'tokens') + + def get_cases(self): + """Returns a list of 2-tuples (condition, value). + + If an ELSE exists condition is None. 
+ """ + ret = [] + in_condition = in_value = False + for token in self.tokens: + if token.match(T.Keyword, 'WHEN'): + ret.append(([], [])) + in_condition = True + in_value = False + elif token.match(T.Keyword, 'ELSE'): + ret.append((None, [])) + in_condition = False + in_value = True + elif token.match(T.Keyword, 'THEN'): + in_condition = False + in_value = True + elif token.match(T.Keyword, 'END'): + in_condition = False + in_value = False + if in_condition: + ret[-1][0].append(token) + elif in_value: + ret[-1][1].append(token) + return ret + +def _group_left_right(tlist, ttype, value, cls, + check_right=lambda t: True, + include_semicolon=False): +# [_group_left_right(sgroup, ttype, value, cls, check_right, +# include_semicolon) for sgroup in tlist.get_sublists() +# if not isinstance(sgroup, cls)] + idx = 0 + token = tlist.token_next_match(idx, ttype, value) + while token: + right = tlist.token_next(tlist.token_index(token)) + left = tlist.token_prev(tlist.token_index(token)) + if (right is None or not check_right(right) + or left is None): + token = tlist.token_next_match(tlist.token_index(token)+1, + ttype, value) + else: + if include_semicolon: + right = tlist.token_next_match(tlist.token_index(right), + T.Punctuation, ';') + tokens = tlist.tokens_between(left, right)[1:] + if not isinstance(left, cls): + new = cls([left]) + new_idx = tlist.token_index(left) + tlist.tokens.remove(left) + tlist.tokens.insert(new_idx, new) + left = new + left.tokens.extend(tokens) + for t in tokens: + tlist.tokens.remove(t) + token = tlist.token_next_match(tlist.token_index(left)+1, + ttype, value) + +def _group_matching(tlist, start_ttype, start_value, end_ttype, end_value, + cls, include_semicolon=False, recurse=False): + def _find_matching(i, tl, stt, sva, ett, eva): + depth = 1 + for t in tl.tokens[i:]: + if t.match(stt, sva): + depth += 1 + elif t.match(ett, eva): + depth -= 1 + if depth == 1: + return t + return None + [_group_matching(sgroup, start_ttype, start_value, end_ttype, end_value, + cls, include_semicolon) for sgroup in tlist.get_sublists() + if recurse] + if isinstance(tlist, cls): + idx = 1 + else: + idx = 0 + token = tlist.token_next_match(idx, start_ttype, start_value) + while token: + tidx = tlist.token_index(token) + end = _find_matching(tidx, tlist, start_ttype, start_value, + end_ttype, end_value) + if end is None: + idx = tidx+1 + else: + if include_semicolon: + next_ = tlist.token_next(tlist.token_index(end)) + if next_ and next_.match(T.Punctuation, ';'): + end = next_ + group = tlist.group_tokens(cls, tlist.tokens_between(token, end)) + _group_matching(group, start_ttype, start_value, + end_ttype, end_value, cls, include_semicolon) + idx = tlist.token_index(group)+1 + token = tlist.token_next_match(idx, start_ttype, start_value) + +def group_if(tlist): + _group_matching(tlist, T.Keyword, 'IF', T.Keyword, 'END IF', If, True) + +def group_for(tlist): + _group_matching(tlist, T.Keyword, 'FOR', T.Keyword, 'END LOOP', For, True) + +def group_as(tlist): + _group_left_right(tlist, T.Keyword, 'AS', Identifier) + +def group_assignment(tlist): + _group_left_right(tlist, T.Assignment, ':=', Assignment, + include_semicolon=True) + +def group_comparsion(tlist): + _group_left_right(tlist, T.Operator, None, Comparsion) + + +def group_case(tlist): + _group_matching(tlist, T.Keyword, 'CASE', T.Keyword, 'END', Case, True) + + +def group_identifier(tlist): + def _consume_cycle(tl, i): + x = itertools.cycle((lambda y: y.match(T.Punctuation, '.'), + lambda y: y.ttype in (T.String.Symbol, 
T.Name))) + for t in tl.tokens[i:]: + if x.next()(t): + yield t + else: + raise StopIteration + + # bottom up approach: group subgroups first + [group_identifier(sgroup) for sgroup in tlist.get_sublists() + if not isinstance(sgroup, Identifier)] + + # real processing + idx = 0 + token = tlist.token_next_by_type(idx, (T.String.Symbol, T.Name)) + while token: + identifier_tokens = [token]+list( + _consume_cycle(tlist, + tlist.token_index(token)+1)) + group = tlist.group_tokens(Identifier, identifier_tokens) + idx = tlist.token_index(group)+1 + token = tlist.token_next_by_type(idx, (T.String.Symbol, T.Name)) + + +def group_identifier_list(tlist): + [group_identifier_list(sgroup) for sgroup in tlist.get_sublists() + if not isinstance(sgroup, IdentifierList)] + idx = 0 + token = tlist.token_next_by_instance(idx, Identifier) + while token: + tidx = tlist.token_index(token) + end = tlist.token_not_matching(tidx+1, + [lambda t: isinstance(t, Identifier), + lambda t: t.is_whitespace(), + lambda t: t.match(T.Punctuation, + ',') + ]) + if end is None: + idx = tidx + 1 + else: + grp_tokens = tlist.tokens_between(token, end, exclude_end=True) + while grp_tokens and (grp_tokens[-1].is_whitespace() + or grp_tokens[-1].match(T.Punctuation, ',')): + grp_tokens.pop() + if len(grp_tokens) <= 1: + idx = tidx + 1 + else: + group = tlist.group_tokens(IdentifierList, grp_tokens) + idx = tlist.token_index(group) + token = tlist.token_next_by_instance(idx, Identifier) + + +def group_parenthesis(tlist): + _group_matching(tlist, T.Punctuation, '(', T.Punctuation, ')', Parenthesis) + +def group_comments(tlist): + [group_comments(sgroup) for sgroup in tlist.get_sublists() + if not isinstance(sgroup, Comment)] + idx = 0 + token = tlist.token_next_by_type(idx, T.Comment) + while token: + tidx = tlist.token_index(token) + end = tlist.token_not_matching(tidx+1, + [lambda t: t.ttype in T.Comment, + lambda t: t.is_whitespace()]) + if end is None: + idx = tidx + 1 + else: + eidx = tlist.token_index(end) + grp_tokens = tlist.tokens_between(token, + tlist.token_prev(eidx, False)) + group = tlist.group_tokens(Comment, grp_tokens) + idx = tlist.token_index(group) + token = tlist.token_next_by_type(idx, T.Comment) + +def group_where(tlist): + [group_where(sgroup) for sgroup in tlist.get_sublists() + if not isinstance(sgroup, Where)] + idx = 0 + token = tlist.token_next_match(idx, T.Keyword, 'WHERE') + stopwords = ('ORDER', 'GROUP', 'LIMIT', 'UNION') + while token: + tidx = tlist.token_index(token) + end = tlist.token_next_match(tidx+1, T.Keyword, stopwords) + if end is None: + end = tlist.tokens[-1] + else: + end = tlist.tokens[tlist.token_index(end)-1] + group = tlist.group_tokens(Where, tlist.tokens_between(token, end)) + idx = tlist.token_index(group) + token = tlist.token_next_match(idx, T.Keyword, 'WHERE') + +def group_aliased(tlist): + [group_aliased(sgroup) for sgroup in tlist.get_sublists() + if not isinstance(sgroup, Identifier)] + idx = 0 + token = tlist.token_next_by_instance(idx, Identifier) + while token: + next_ = tlist.token_next(tlist.token_index(token)) + if next_ is not None and isinstance(next_, Identifier): + grp = tlist.tokens_between(token, next_)[1:] + token.tokens.extend(grp) + for t in grp: + tlist.tokens.remove(t) + idx = tlist.token_index(token)+1 + token = tlist.token_next_by_instance(idx, Identifier) + + +def group_typecasts(tlist): + _group_left_right(tlist, T.Punctuation, '::', Identifier) + + +def group(tlist): + for func in [group_parenthesis, + group_comments, + group_where, + group_case, + 
group_identifier, + group_typecasts, + group_as, + group_aliased, + group_assignment, + group_comparsion, + group_identifier_list, + group_if, + group_for,]: + func(tlist) diff --git a/sqlparse/filters.py b/sqlparse/filters.py new file mode 100644 index 0000000..695b298 --- /dev/null +++ b/sqlparse/filters.py @@ -0,0 +1,432 @@ +# -*- coding: utf-8 -*- + +import re + +from sqlparse.engine import grouping +from sqlparse import tokens as T + + +class Filter(object): + + def process(self, *args): + raise NotImplementedError + + +class TokenFilter(Filter): + + def process(self, stack, stream): + raise NotImplementedError + + +# FIXME: Should be removed +def rstrip(stream): + buff = [] + for token in stream: + if token.is_whitespace() and '\n' in token.value: + # assuming there's only one \n in value + before, rest = token.value.split('\n', 1) + token.value = '\n%s' % rest + buff = [] + yield token + elif token.is_whitespace(): + buff.append(token) + elif token.is_group(): + token.tokens = list(rstrip(token.tokens)) + # process group and look if it starts with a nl + if token.tokens and token.tokens[0].is_whitespace(): + before, rest = token.tokens[0].value.split('\n', 1) + token.tokens[0].value = '\n%s' % rest + buff = [] + while buff: + yield buff.pop(0) + yield token + else: + while buff: + yield buff.pop(0) + yield token + + +# -------------------------- +# token process + +class _CaseFilter(TokenFilter): + + ttype = None + + def __init__(self, case=None): + if case is None: + case = 'upper' + assert case in ['lower', 'upper', 'capitalize'] + self.convert = getattr(unicode, case) + + def process(self, stack, stream): + for ttype, value in stream: + if ttype in self.ttype: + value = self.convert(value) + yield ttype, value + + +class KeywordCaseFilter(_CaseFilter): + ttype = T.Keyword + + +class IdentifierCaseFilter(_CaseFilter): + ttype = (T.Name, T.String.Symbol) + + +# ---------------------- +# statement process + +class StripCommentsFilter(Filter): + + def _process(self, tlist): + idx = 0 + clss = set([x.__class__ for x in tlist.tokens]) + while grouping.Comment in clss: + token = tlist.token_next_by_instance(0, grouping.Comment) + tidx = tlist.token_index(token) + prev = tlist.token_prev(tidx, False) + next_ = tlist.token_next(tidx, False) + # Replace by whitespace if prev and next exist and if they're not + # whitespaces. This doesn't apply if prev or next is a paranthesis. 
+ if (prev is not None and next_ is not None + and not prev.is_whitespace() and not next_.is_whitespace() + and not (prev.match(T.Punctuation, '(') + or next_.match(T.Punctuation, ')'))): + tlist.tokens[tidx] = grouping.Token(T.Whitespace, ' ') + else: + tlist.tokens.pop(tidx) + clss = set([x.__class__ for x in tlist.tokens]) + + def process(self, stack, stmt): + [self.process(stack, sgroup) for sgroup in stmt.get_sublists()] + self._process(stmt) + + +class StripWhitespaceFilter(Filter): + + def _stripws(self, tlist): + func_name = '_stripws_%s' % tlist.__class__.__name__.lower() + func = getattr(self, func_name, self._stripws_default) + func(tlist) + + def _stripws_default(self, tlist): + last_was_ws = False + for token in tlist.tokens: + if token.is_whitespace(): + if last_was_ws: + token.value = '' + else: + token.value = ' ' + last_was_ws = token.is_whitespace() + + def _stripws_parenthesis(self, tlist): + if tlist.tokens[1].is_whitespace(): + tlist.tokens.pop(1) + if tlist.tokens[-2].is_whitespace(): + tlist.tokens.pop(-2) + self._stripws_default(tlist) + + def process(self, stack, stmt): + [self.process(stack, sgroup) for sgroup in stmt.get_sublists()] + self._stripws(stmt) + if stmt.tokens[-1].is_whitespace(): + stmt.tokens.pop(-1) + + +class ReindentFilter(Filter): + + def __init__(self, width=2, char=' ', line_width=None): + self.width = width + self.char = char + self.indent = 0 + self.offset = 0 + self.line_width = line_width + self._curr_stmt = None + self._last_stmt = None + + def _get_offset(self, token): + all_ = list(self._curr_stmt.flatten()) + idx = all_.index(token) + raw = ''.join(unicode(x) for x in all_[:idx+1]) + line = raw.splitlines()[-1] + # Now take current offset into account and return relative offset. + full_offset = len(line)-(len(self.char*(self.width*self.indent))) + return full_offset - self.offset + + def nl(self): + # TODO: newline character should be configurable + ws = '\n'+(self.char*((self.indent*self.width)+self.offset)) + return grouping.Token(T.Whitespace, ws) + + def _split_kwds(self, tlist): + split_words = ('FROM', 'JOIN$', 'AND', 'OR', + 'GROUP', 'ORDER', 'UNION', 'VALUES') + idx = 0 + token = tlist.token_next_match(idx, T.Keyword, split_words, + regex=True) + while token: + prev = tlist.token_prev(tlist.token_index(token), False) + offset = 1 + if prev and prev.is_whitespace(): + tlist.tokens.pop(tlist.token_index(prev)) + offset += 1 + nl = self.nl() + tlist.insert_before(token, nl) + token = tlist.token_next_match(tlist.token_index(nl)+offset, + T.Keyword, split_words, regex=True) + + def _split_statements(self, tlist): + idx = 0 + token = tlist.token_next_by_type(idx, (T.Keyword.DDL, T.Keyword.DML)) + while token: + prev = tlist.token_prev(tlist.token_index(token), False) + if prev and prev.is_whitespace(): + tlist.tokens.pop(tlist.token_index(prev)) + # only break if it's not the first token + if prev: + nl = self.nl() + tlist.insert_before(token, nl) + token = tlist.token_next_by_type(tlist.token_index(token)+1, + (T.Keyword.DDL, T.Keyword.DML)) + + def _process(self, tlist): + func_name = '_process_%s' % tlist.__class__.__name__.lower() + func = getattr(self, func_name, self._process_default) + func(tlist) + + def _process_where(self, tlist): + token = tlist.token_next_match(0, T.Keyword, 'WHERE') + tlist.insert_before(token, self.nl()) + self.indent += 1 + self._process_default(tlist) + self.indent -= 1 + + def _process_parenthesis(self, tlist): + first = tlist.token_next(0) + indented = False + if first and first.ttype in 
(T.Keyword.DML, T.Keyword.DDL): + self.indent += 1 + tlist.tokens.insert(0, self.nl()) + indented = True + num_offset = self._get_offset(tlist.token_next_match(0, + T.Punctuation, '(')) + self.offset += num_offset + self._process_default(tlist, stmts=not indented) + if indented: + self.indent -= 1 + self.offset -= num_offset + + def _process_identifierlist(self, tlist): + identifiers = tlist.get_identifiers() + if len(identifiers) > 1: + first = list(identifiers[0].flatten())[0] + num_offset = self._get_offset(first)-len(first.value) + self.offset += num_offset + for token in identifiers[1:]: + tlist.insert_before(token, self.nl()) + self.offset -= num_offset + self._process_default(tlist) + + def _process_case(self, tlist): + cases = tlist.get_cases() + is_first = True + num_offset = None + case = tlist.tokens[0] + outer_offset = self._get_offset(case)-len(case.value) + self.offset += outer_offset + for cond, value in tlist.get_cases(): + if is_first: + is_first = False + num_offset = self._get_offset(cond[0])-len(cond[0].value) + self.offset += num_offset + continue + if cond is None: + token = value[0] + else: + token = cond[0] + tlist.insert_before(token, self.nl()) + # Line breaks on group level are done. Now let's add an offset of + # 5 (=length of "when", "then", "else") and process subgroups. + self.offset += 5 + self._process_default(tlist) + self.offset -= 5 + if num_offset is not None: + self.offset -= num_offset + end = tlist.token_next_match(0, T.Keyword, 'END') + tlist.insert_before(end, self.nl()) + self.offset -= outer_offset + + def _process_default(self, tlist, stmts=True, kwds=True): + if stmts: + self._split_statements(tlist) + if kwds: + self._split_kwds(tlist) + [self._process(sgroup) for sgroup in tlist.get_sublists()] + + def process(self, stack, stmt): + if isinstance(stmt, grouping.Statement): + self._curr_stmt = stmt + self._process(stmt) + if isinstance(stmt, grouping.Statement): + if self._last_stmt is not None: + if self._last_stmt.to_unicode().endswith('\n'): + nl = '\n' + else: + nl = '\n\n' + stmt.tokens.insert(0, + grouping.Token(T.Whitespace, nl)) + if self._last_stmt != stmt: + self._last_stmt = stmt + + +# FIXME: Doesn't work ;) +class RightMarginFilter(Filter): + + keep_together = ( +# grouping.TypeCast, grouping.Identifier, grouping.Alias, + ) + + def __init__(self, width=79): + self.width = width + self.line = '' + + def _process(self, stack, group, stream): + for token in stream: + if token.is_whitespace() and '\n' in token.value: + if token.value.endswith('\n'): + self.line = '' + else: + self.line = token.value.splitlines()[-1] + elif (token.is_group() + and not token.__class__ in self.keep_together): + token.tokens = self._process(stack, token, token.tokens) + else: + val = token.to_unicode() + if len(self.line) + len(val) > self.width: + match = re.search('^ +', self.line) + if match is not None: + indent = match.group() + else: + indent = '' + yield grouping.Token(T.Whitespace, '\n%s' % indent) + self.line = indent + self.line += val + yield token + + def process(self, stack, group): + return + group.tokens = self._process(stack, group, group.tokens) + + +# --------------------------- +# postprocess + +class SerializerUnicode(Filter): + + def process(self, stack, stmt): + raw = stmt.to_unicode() + add_nl = raw.endswith('\n') + res = '\n'.join(line.rstrip() for line in raw.splitlines()) + if add_nl: + res += '\n' + return res + + +class OutputPythonFilter(Filter): + + def __init__(self, varname='sql'): + self.varname = varname + self.cnt = 0 + + 
def _process(self, stream, varname, count, has_nl): + if count > 1: + yield grouping.Token(T.Whitespace, '\n') + yield grouping.Token(T.Name, varname) + yield grouping.Token(T.Whitespace, ' ') + yield grouping.Token(T.Operator, '=') + yield grouping.Token(T.Whitespace, ' ') + if has_nl: + yield grouping.Token(T.Operator, '(') + yield grouping.Token(T.Text, "'") + cnt = 0 + for token in stream: + cnt += 1 + if token.is_whitespace() and '\n' in token.value: + if cnt == 1: + continue + after_lb = token.value.split('\n', 1)[1] + yield grouping.Token(T.Text, "'") + yield grouping.Token(T.Whitespace, '\n') + for i in range(len(varname)+4): + yield grouping.Token(T.Whitespace, ' ') + yield grouping.Token(T.Text, "'") + if after_lb: # it's the indendation + yield grouping.Token(T.Whitespace, after_lb) + continue + elif token.value and "'" in token.value: + token.value = token.value.replace("'", "\\'") + yield grouping.Token(T.Text, token.value or '') + yield grouping.Token(T.Text, "'") + if has_nl: + yield grouping.Token(T.Operator, ')') + + def process(self, stack, stmt): + self.cnt += 1 + if self.cnt > 1: + varname = '%s%d' % (self.varname, self.cnt) + else: + varname = self.varname + has_nl = len(stmt.to_unicode().strip().splitlines()) > 1 + stmt.tokens = self._process(stmt.tokens, varname, self.cnt, has_nl) + return stmt + + +class OutputPHPFilter(Filter): + + def __init__(self, varname='sql'): + self.varname = '$%s' % varname + self.count = 0 + + def _process(self, stream, varname): + if self.count > 1: + yield grouping.Token(T.Whitespace, '\n') + yield grouping.Token(T.Name, varname) + yield grouping.Token(T.Whitespace, ' ') + yield grouping.Token(T.Operator, '=') + yield grouping.Token(T.Whitespace, ' ') + yield grouping.Token(T.Text, '"') + cnt = 0 + for token in stream: + if token.is_whitespace() and '\n' in token.value: + cnt += 1 + if cnt == 1: + continue + after_lb = token.value.split('\n', 1)[1] + yield grouping.Token(T.Text, '"') + yield grouping.Token(T.Operator, ';') + yield grouping.Token(T.Whitespace, '\n') + yield grouping.Token(T.Name, varname) + yield grouping.Token(T.Whitespace, ' ') + yield grouping.Token(T.Punctuation, '.') + yield grouping.Token(T.Operator, '=') + yield grouping.Token(T.Whitespace, ' ') + yield grouping.Token(T.Text, '"') + if after_lb: + yield grouping.Token(T.Text, after_lb) + continue + elif '"' in token.value: + token.value = token.value.replace('"', '\\"') + yield grouping.Token(T.Text, token.value) + yield grouping.Token(T.Text, '"') + yield grouping.Token(T.Punctuation, ';') + + def process(self, stack, stmt): + self.count += 1 + if self.count > 1: + varname = '%s%d' % (self.varname, self.count) + else: + varname = self.varname + stmt.tokens = tuple(self._process(stmt.tokens, varname)) + return stmt + diff --git a/sqlparse/formatter.py b/sqlparse/formatter.py new file mode 100644 index 0000000..9d443ca --- /dev/null +++ b/sqlparse/formatter.py @@ -0,0 +1,163 @@ +# Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com +# +# This module is part of python-sqlparse and is released under +# the BSD License: http://www.opensource.org/licenses/bsd-license.php. 
+ +"""SQL formatter""" + +import logging + +from sqlparse import SQLParseError +from sqlparse import filters + + +def validate_options(options): + """Validates options.""" + kwcase = options.get('keyword_case', None) + if kwcase not in [None, 'upper', 'lower', 'capitalize']: + raise SQLParseError('Invalid value for keyword_case: %r' % kwcase) + + idcase = options.get('identifier_case', None) + if idcase not in [None, 'upper', 'lower', 'capitalize']: + raise SQLParseError('Invalid value for identifier_case: %r' % idcase) + + ofrmt = options.get('output_format', None) + if ofrmt not in [None, 'sql', 'python', 'php']: + raise SQLParseError('Unknown output format: %r' % ofrmt) + + strip_comments = options.get('strip_comments', False) + if strip_comments not in [True, False]: + raise SQLParseError('Invalid value for strip_comments: %r' + % strip_comments) + + strip_ws = options.get('strip_whitespace', False) + if strip_ws not in [True, False]: + raise SQLParseError('Invalid value for strip_whitespace: %r' + % strip_ws) + + reindent = options.get('reindent', False) + if reindent not in [True, False]: + raise SQLParseError('Invalid value for reindent: %r' + % reindent) + elif reindent: + options['strip_whitespace'] = True + indent_tabs = options.get('indent_tabs', False) + if indent_tabs not in [True, False]: + raise SQLParserError('Invalid value for indent_tabs: %r' % indent_tabs) + elif indent_tabs: + options['indent_char'] = '\t' + else: + options['indent_char'] = ' ' + indent_width = options.get('indent_width', 2) + try: + indent_width = int(indent_width) + except (TypeError, ValueError): + raise SQLParseError('indent_width requires an integer') + if indent_width < 1: + raise SQLParseError('indent_width requires an positive integer') + options['indent_width'] = indent_width + + right_margin = options.get('right_margin', None) + if right_margin is not None: + try: + right_margin = int(right_margin) + except (TypeError, ValueError): + raise SQLParseError('right_margin requires an integer') + if right_margin < 10: + raise SQLParseError('right_margin requires an integer > 10') + options['right_margin'] = right_margin + + return options + + +def build_filter_stack(stack, options): + """Setup and return a filter stack. + + Args: + stack: :class:`~sqlparse.filters.FilterStack` instance + options: Dictionary with options validated by validate_options. 
+ """ + # Token filter + if 'keyword_case' in options: + stack.preprocess.append( + filters.KeywordCaseFilter(options['keyword_case'])) + + if 'identifier_case' in options: + stack.preprocess.append( + filters.IdentifierCaseFilter(options['identifier_case'])) + + # After grouping + if options.get('strip_comments', False): + stack.enable_grouping() + stack.stmtprocess.append(filters.StripCommentsFilter()) + + if (options.get('strip_whitespace', False) + or options.get('reindent', False)): + stack.enable_grouping() + stack.stmtprocess.append(filters.StripWhitespaceFilter()) + + if options.get('reindent', False): + stack.enable_grouping() + stack.stmtprocess.append( + filters.ReindentFilter(char=options['indent_char'], + width=options['indent_width'])) + + if options.get('right_margin', False): + stack.enable_grouping() + stack.stmtprocess.append( + filters.RightMarginFilter(width=options['right_margin'])) + + # Serializer + if options.get('output_format'): + frmt = options['output_format'] + if frmt.lower() == 'php': + fltr = filters.OutputPHPFilter() + elif frmt.lower() == 'python': + fltr = filters.OutputPythonFilter() + else: + fltr = None + if fltr is not None: + stack.postprocess.append(fltr) + + return stack + + +def format(statement, **options): + import filters + logging.info('OPTIONS %r', options) + lexer = Lexer() +# lexer.add_filter('whitespace') + lexer.add_filter(filters.GroupFilter()) + if options.get('reindent', False): + lexer.add_filter(filters.StripWhitespaceFilter()) + lexer.add_filter(filters.IndentFilter( + n_indents=options.get('n_indents', 2))) + if options.get('ltrim', False): + lexer.add_filter(filters.LTrimFilter()) + keyword_case = options.get('keyword_case', None) + if keyword_case is not None: + assert keyword_case in ('lower', 'upper', 'capitalize') + lexer.add_filter(filters.KeywordCaseFilter(case=keyword_case)) + identifier_case = options.get('identifier_case', None) + if identifier_case is not None: + assert identifier_case in ('lower', 'upper', 'capitalize') + lexer.add_filter(filters.IdentifierCaseFilter(case=identifier_case)) + if options.get('strip_comments', False): + lexer.add_filter(filters.StripCommentsFilter()) + right_margin = options.get('right_margin', None) + if right_margin is not None: + right_margin = int(right_margin) + assert right_margin > 0 + lexer.add_filter(filters.RightMarginFilter(margin=right_margin)) + lexer.add_filter(filters.UngroupFilter()) + if options.get('output_format', None): + ofrmt = options['output_format'] + assert ofrmt in ('sql', 'python', 'php') + if ofrmt == 'python': + lexer.add_filter(filters.OutputPythonFilter()) + elif ofrmt == 'php': + lexer.add_filter(filters.OutputPHPFilter()) + tokens = [] + for ttype, value in lexer.get_tokens(unicode(statement)): + tokens.append((ttype, value)) + return statement.__class__(tokens) diff --git a/sqlparse/keywords.py b/sqlparse/keywords.py new file mode 100644 index 0000000..3f0632e --- /dev/null +++ b/sqlparse/keywords.py @@ -0,0 +1,589 @@ +from sqlparse.tokens import * + +KEYWORDS = { + 'ABORT': Keyword, + 'ABS': Keyword, + 'ABSOLUTE': Keyword, + 'ACCESS': Keyword, + 'ADA': Keyword, + 'ADD': Keyword, + 'ADMIN': Keyword, + 'AFTER': Keyword, + 'AGGREGATE': Keyword, + 'ALIAS': Keyword, + 'ALL': Keyword, + 'ALLOCATE': Keyword, + 'ANALYSE': Keyword, + 'ANALYZE': Keyword, + 'AND': Keyword, + 'ANY': Keyword, + 'ARE': Keyword, + 'AS': Keyword, + 'ASC': Keyword, + 'ASENSITIVE': Keyword, + 'ASSERTION': Keyword, + 'ASSIGNMENT': Keyword, + 'ASYMMETRIC': Keyword, + 'AT': Keyword, + 
'ATOMIC': Keyword, + 'AUTHORIZATION': Keyword, + 'AVG': Keyword, + + 'BACKWARD': Keyword, + 'BEFORE': Keyword, + 'BEGIN': Keyword, + 'BETWEEN': Keyword, + 'BITVAR': Keyword, + 'BIT_LENGTH': Keyword, + 'BOTH': Keyword, + 'BREADTH': Keyword, + 'BY': Keyword, + + 'C': Keyword, + 'CACHE': Keyword, + 'CALL': Keyword, + 'CALLED': Keyword, + 'CARDINALITY': Keyword, + 'CASCADE': Keyword, + 'CASCADED': Keyword, + 'CASE': Keyword, + 'CAST': Keyword, + 'CATALOG': Keyword, + 'CATALOG_NAME': Keyword, + 'CHAIN': Keyword, + 'CHARACTERISTICS': Keyword, + 'CHARACTER_LENGTH': Keyword, + 'CHARACTER_SET_CATALOG': Keyword, + 'CHARACTER_SET_NAME': Keyword, + 'CHARACTER_SET_SCHEMA': Keyword, + 'CHAR_LENGTH': Keyword, + 'CHECK': Keyword, + 'CHECKED': Keyword, + 'CHECKPOINT': Keyword, + 'CLASS': Keyword, + 'CLASS_ORIGIN': Keyword, + 'CLOB': Keyword, + 'CLOSE': Keyword, + 'CLUSTER': Keyword, + 'COALSECE': Keyword, + 'COBOL': Keyword, + 'COLLATE': Keyword, + 'COLLATION': Keyword, + 'COLLATION_CATALOG': Keyword, + 'COLLATION_NAME': Keyword, + 'COLLATION_SCHEMA': Keyword, + 'COLUMN': Keyword, + 'COLUMN_NAME': Keyword, + 'COMMAND_FUNCTION': Keyword, + 'COMMAND_FUNCTION_CODE': Keyword, + 'COMMENT': Keyword, + 'COMMIT': Keyword, + 'COMMITTED': Keyword, + 'COMPLETION': Keyword, + 'CONDITION_NUMBER': Keyword, + 'CONNECT': Keyword, + 'CONNECTION': Keyword, + 'CONNECTION_NAME': Keyword, + 'CONSTRAINT': Keyword, + 'CONSTRAINTS': Keyword, + 'CONSTRAINT_CATALOG': Keyword, + 'CONSTRAINT_NAME': Keyword, + 'CONSTRAINT_SCHEMA': Keyword, + 'CONSTRUCTOR': Keyword, + 'CONTAINS': Keyword, + 'CONTINUE': Keyword, + 'CONVERSION': Keyword, + 'CONVERT': Keyword, + 'COPY': Keyword, + 'CORRESPONTING': Keyword, + 'COUNT': Keyword, + 'CREATEDB': Keyword, + 'CREATEUSER': Keyword, + 'CROSS': Keyword, + 'CUBE': Keyword, + 'CURRENT': Keyword, + 'CURRENT_DATE': Keyword, + 'CURRENT_PATH': Keyword, + 'CURRENT_ROLE': Keyword, + 'CURRENT_TIME': Keyword, + 'CURRENT_TIMESTAMP': Keyword, + 'CURRENT_USER': Keyword, + 'CURSOR': Keyword, + 'CURSOR_NAME': Keyword, + 'CYCLE': Keyword, + + 'DATA': Keyword, + 'DATABASE': Keyword, + 'DATETIME_INTERVAL_CODE': Keyword, + 'DATETIME_INTERVAL_PRECISION': Keyword, + 'DAY': Keyword, + 'DEALLOCATE': Keyword, + 'DECLARE': Keyword, + 'DEFAULT': Keyword, + 'DEFAULTS': Keyword, + 'DEFERRABLE': Keyword, + 'DEFERRED': Keyword, + 'DEFINED': Keyword, + 'DEFINER': Keyword, + 'DELIMITER': Keyword, + 'DELIMITERS': Keyword, + 'DEREF': Keyword, + 'DESC': Keyword, + 'DESCRIBE': Keyword, + 'DESCRIPTOR': Keyword, + 'DESTROY': Keyword, + 'DESTRUCTOR': Keyword, + 'DETERMINISTIC': Keyword, + 'DIAGNOSTICS': Keyword, + 'DICTIONARY': Keyword, + 'DISCONNECT': Keyword, + 'DISPATCH': Keyword, + 'DISTINCT': Keyword, + 'DO': Keyword, + 'DOMAIN': Keyword, + 'DYNAMIC': Keyword, + 'DYNAMIC_FUNCTION': Keyword, + 'DYNAMIC_FUNCTION_CODE': Keyword, + + 'EACH': Keyword, + 'ELSE': Keyword, + 'ENCODING': Keyword, + 'ENCRYPTED': Keyword, + 'END': Keyword, + 'END-EXEC': Keyword, + 'EQUALS': Keyword, + 'ESCAPE': Keyword, + 'EVERY': Keyword, + 'EXCEPT': Keyword, + 'ESCEPTION': Keyword, + 'EXCLUDING': Keyword, + 'EXCLUSIVE': Keyword, + 'EXEC': Keyword, + 'EXECUTE': Keyword, + 'EXISTING': Keyword, + 'EXISTS': Keyword, + 'EXTERNAL': Keyword, + 'EXTRACT': Keyword, + + 'FALSE': Keyword, + 'FETCH': Keyword, + 'FINAL': Keyword, + 'FIRST': Keyword, + 'FOR': Keyword, + 'FORCE': Keyword, + 'FOREIGN': Keyword, + 'FORTRAN': Keyword, + 'FORWARD': Keyword, + 'FOUND': Keyword, + 'FREE': Keyword, + 'FREEZE': Keyword, + 'FROM': Keyword, + 'FULL': Keyword, + 'FUNCTION': Keyword, 
+ + 'G': Keyword, + 'GENERAL': Keyword, + 'GENERATED': Keyword, + 'GET': Keyword, + 'GLOBAL': Keyword, + 'GO': Keyword, + 'GOTO': Keyword, + 'GRANT': Keyword, + 'GRANTED': Keyword, + 'GROUP': Keyword, + 'GROUPING': Keyword, + + 'HANDLER': Keyword, + 'HAVING': Keyword, + 'HIERARCHY': Keyword, + 'HOLD': Keyword, + 'HOST': Keyword, + + 'IDENTITY': Keyword, + 'IF': Keyword, + 'IGNORE': Keyword, + 'ILIKE': Keyword, + 'IMMEDIATE': Keyword, + 'IMMUTABLE': Keyword, + + 'IMPLEMENTATION': Keyword, + 'IMPLICIT': Keyword, + 'IN': Keyword, + 'INCLUDING': Keyword, + 'INCREMENT': Keyword, + 'INDEX': Keyword, + + 'INDITCATOR': Keyword, + 'INFIX': Keyword, + 'INHERITS': Keyword, + 'INITIALIZE': Keyword, + 'INITIALLY': Keyword, + 'INNER': Keyword, + 'INOUT': Keyword, + 'INPUT': Keyword, + 'INSENSITIVE': Keyword, + 'INSTANTIABLE': Keyword, + 'INSTEAD': Keyword, + 'INTERSECT': Keyword, + 'INTO': Keyword, + 'INVOKER': Keyword, + 'IS': Keyword, + 'ISNULL': Keyword, + 'ISOLATION': Keyword, + 'ITERATE': Keyword, + + 'JOIN': Keyword, + + 'K': Keyword, + 'KEY': Keyword, + 'KEY_MEMBER': Keyword, + 'KEY_TYPE': Keyword, + + 'LANCOMPILER': Keyword, + 'LANGUAGE': Keyword, + 'LARGE': Keyword, + 'LAST': Keyword, + 'LATERAL': Keyword, + 'LEADING': Keyword, + 'LEFT': Keyword, + 'LENGTH': Keyword, + 'LESS': Keyword, + 'LEVEL': Keyword, + 'LIKE': Keyword, + 'LILMIT': Keyword, + 'LISTEN': Keyword, + 'LOAD': Keyword, + 'LOCAL': Keyword, + 'LOCALTIME': Keyword, + 'LOCALTIMESTAMP': Keyword, + 'LOCATION': Keyword, + 'LOCATOR': Keyword, + 'LOCK': Keyword, + 'LOWER': Keyword, + + 'M': Keyword, + 'MAP': Keyword, + 'MATCH': Keyword, + 'MAX': Keyword, + 'MAXVALUE': Keyword, + 'MESSAGE_LENGTH': Keyword, + 'MESSAGE_OCTET_LENGTH': Keyword, + 'MESSAGE_TEXT': Keyword, + 'METHOD': Keyword, + 'MIN': Keyword, + 'MINUTE': Keyword, + 'MINVALUE': Keyword, + 'MOD': Keyword, + 'MODE': Keyword, + 'MODIFIES': Keyword, + 'MODIFY': Keyword, + 'MONTH': Keyword, + 'MORE': Keyword, + 'MOVE': Keyword, + 'MUMPS': Keyword, + + 'NAMES': Keyword, + 'NATIONAL': Keyword, + 'NATURAL': Keyword, + 'NCHAR': Keyword, + 'NCLOB': Keyword, + 'NEW': Keyword, + 'NEXT': Keyword, + 'NO': Keyword, + 'NOCREATEDB': Keyword, + 'NOCREATEUSER': Keyword, + 'NONE': Keyword, + 'NOT': Keyword, + 'NOTHING': Keyword, + 'NOTIFY': Keyword, + 'NOTNULL': Keyword, + 'NULL': Keyword, + 'NULLABLE': Keyword, + 'NULLIF': Keyword, + + 'OBJECT': Keyword, + 'OCTET_LENGTH': Keyword, + 'OF': Keyword, + 'OFF': Keyword, + 'OFFSET': Keyword, + 'OIDS': Keyword, + 'OLD': Keyword, + 'ON': Keyword, + 'ONLY': Keyword, + 'OPEN': Keyword, + 'OPERATION': Keyword, + 'OPERATOR': Keyword, + 'OPTION': Keyword, + 'OPTIONS': Keyword, + 'OR': Keyword, + 'ORDER': Keyword, + 'ORDINALITY': Keyword, + 'OUT': Keyword, + 'OUTER': Keyword, + 'OUTPUT': Keyword, + 'OVERLAPS': Keyword, + 'OVERLAY': Keyword, + 'OVERRIDING': Keyword, + 'OWNER': Keyword, + + 'PAD': Keyword, + 'PARAMETER': Keyword, + 'PARAMETERS': Keyword, + 'PARAMETER_MODE': Keyword, + 'PARAMATER_NAME': Keyword, + 'PARAMATER_ORDINAL_POSITION': Keyword, + 'PARAMETER_SPECIFIC_CATALOG': Keyword, + 'PARAMETER_SPECIFIC_NAME': Keyword, + 'PARAMATER_SPECIFIC_SCHEMA': Keyword, + 'PARTIAL': Keyword, + 'PASCAL': Keyword, + 'PENDANT': Keyword, + 'PLACING': Keyword, + 'PLI': Keyword, + 'POSITION': Keyword, + 'POSTFIX': Keyword, + 'PRECISION': Keyword, + 'PREFIX': Keyword, + 'PREORDER': Keyword, + 'PREPARE': Keyword, + 'PRESERVE': Keyword, + 'PRIMARY': Keyword, + 'PRIOR': Keyword, + 'PRIVILEGES': Keyword, + 'PROCEDURAL': Keyword, + 'PROCEDURE': Keyword, + 'PUBLIC': Keyword, + 
+    'RAISE': Keyword,
+    'READ': Keyword,
+    'READS': Keyword,
+    'RECHECK': Keyword,
+    'RECURSIVE': Keyword,
+    'REF': Keyword,
+    'REFERENCES': Keyword,
+    'REFERENCING': Keyword,
+    'REINDEX': Keyword,
+    'RELATIVE': Keyword,
+    'RENAME': Keyword,
+    'REPEATABLE': Keyword,
+    'REPLACE': Keyword,
+    'RESET': Keyword,
+    'RESTART': Keyword,
+    'RESTRICT': Keyword,
+    'RESULT': Keyword,
+    'RETURN': Keyword,
+    'RETURNED_LENGTH': Keyword,
+    'RETURNED_OCTET_LENGTH': Keyword,
+    'RETURNED_SQLSTATE': Keyword,
+    'RETURNS': Keyword,
+    'REVOKE': Keyword,
+    'RIGHT': Keyword,
+    'ROLE': Keyword,
+    'ROLLBACK': Keyword,
+    'ROLLUP': Keyword,
+    'ROUTINE': Keyword,
+    'ROUTINE_CATALOG': Keyword,
+    'ROUTINE_NAME': Keyword,
+    'ROUTINE_SCHEMA': Keyword,
+    'ROW': Keyword,
+    'ROWS': Keyword,
+    'ROW_COUNT': Keyword,
+    'RULE': Keyword,
+
+    'SAVEPOINT': Keyword,
+    'SCALE': Keyword,
+    'SCHEMA': Keyword,
+    'SCHEMA_NAME': Keyword,
+    'SCOPE': Keyword,
+    'SCROLL': Keyword,
+    'SEARCH': Keyword,
+    'SECOND': Keyword,
+    'SECURITY': Keyword,
+    'SELF': Keyword,
+    'SENSITIVE': Keyword,
+    'SERIALIZABLE': Keyword,
+    'SERVER_NAME': Keyword,
+    'SESSION': Keyword,
+    'SESSION_USER': Keyword,
+    'SETOF': Keyword,
+    'SETS': Keyword,
+    'SHARE': Keyword,
+    'SHOW': Keyword,
+    'SIMILAR': Keyword,
+    'SIMPLE': Keyword,
+    'SIZE': Keyword,
+    'SOME': Keyword,
+    'SOURCE': Keyword,
+    'SPACE': Keyword,
+    'SPECIFIC': Keyword,
+    'SPECIFICTYPE': Keyword,
+    'SPECIFIC_NAME': Keyword,
+    'SQL': Keyword,
+    'SQLCODE': Keyword,
+    'SQLERROR': Keyword,
+    'SQLEXCEPTION': Keyword,
+    'SQLSTATE': Keyword,
+    'SQLWARNING': Keyword,
+    'STABLE': Keyword,
+    'START': Keyword,
+    'STATE': Keyword,
+    'STATEMENT': Keyword,
+    'STATIC': Keyword,
+    'STATISTICS': Keyword,
+    'STDIN': Keyword,
+    'STDOUT': Keyword,
+    'STORAGE': Keyword,
+    'STRICT': Keyword,
+    'STRUCTURE': Keyword,
+    'STYPE': Keyword,
+    'SUBCLASS_ORIGIN': Keyword,
+    'SUBLIST': Keyword,
+    'SUBSTRING': Keyword,
+    'SUM': Keyword,
+    'SYMMETRIC': Keyword,
+    'SYSID': Keyword,
+    'SYSTEM': Keyword,
+    'SYSTEM_USER': Keyword,
+
+    'TABLE': Keyword,
+    'TABLE_NAME': Keyword,
+    'TEMP': Keyword,
+    'TEMPLATE': Keyword,
+    'TEMPORARY': Keyword,
+    'TERMINATE': Keyword,
+    'THAN': Keyword,
+    'THEN': Keyword,
+    'TIMESTAMP': Keyword,
+    'TIMEZONE_HOUR': Keyword,
+    'TIMEZONE_MINUTE': Keyword,
+    'TO': Keyword,
+    'TOAST': Keyword,
+    'TRAILING': Keyword,
+    'TRANSACTION': Keyword,
+    'TRANSACTIONS_COMMITTED': Keyword,
+    'TRANSACTIONS_ROLLED_BACK': Keyword,
+    'TRANSACTION_ACTIVE': Keyword,
+    'TRANSFORM': Keyword,
+    'TRANSFORMS': Keyword,
+    'TRANSLATE': Keyword,
+    'TRANSLATION': Keyword,
+    'TREAT': Keyword,
+    'TRIGGER': Keyword,
+    'TRIGGER_CATALOG': Keyword,
+    'TRIGGER_NAME': Keyword,
+    'TRIGGER_SCHEMA': Keyword,
+    'TRIM': Keyword,
+    'TRUE': Keyword,
+    'TRUNCATE': Keyword,
+    'TRUSTED': Keyword,
+    'TYPE': Keyword,
+
+    'UNCOMMITTED': Keyword,
+    'UNDER': Keyword,
+    'UNENCRYPTED': Keyword,
+    'UNION': Keyword,
+    'UNIQUE': Keyword,
+    'UNKNOWN': Keyword,
+    'UNLISTEN': Keyword,
+    'UNNAMED': Keyword,
+    'UNNEST': Keyword,
+    'UNTIL': Keyword,
+    'UPPER': Keyword,
+    'USAGE': Keyword,
+    'USER': Keyword,
+    'USER_DEFINED_TYPE_CATALOG': Keyword,
+    'USER_DEFINED_TYPE_NAME': Keyword,
+    'USER_DEFINED_TYPE_SCHEMA': Keyword,
+    'USING': Keyword,
+
+    'VACUUM': Keyword,
+    'VALID': Keyword,
+    'VALIDATOR': Keyword,
+    'VALUES': Keyword,
+    'VARIABLE': Keyword,
+    'VERBOSE': Keyword,
+    'VERSION': Keyword,
+    'VIEW': Keyword,
+    'VOLATILE': Keyword,
+
+    'WHEN': Keyword,
+    'WHENEVER': Keyword,
+    'WHERE': Keyword,
+    'WITH': Keyword,
+    'WITHOUT': Keyword,
+    'WORK': Keyword,
+    'WRITE': Keyword,
+
+    'YEAR': Keyword,
+
+    'ZONE': Keyword,
+
+
+    'ARRAY': Name.Builtin,
+    'BIGINT': Name.Builtin,
+    'BINARY': Name.Builtin,
+    'BIT': Name.Builtin,
+    'BLOB': Name.Builtin,
+    'BOOLEAN': Name.Builtin,
+    'CHAR': Name.Builtin,
+    'CHARACTER': Name.Builtin,
+    'DATE': Name.Builtin,
+    'DEC': Name.Builtin,
+    'DECIMAL': Name.Builtin,
+    'FLOAT': Name.Builtin,
+    'INT': Name.Builtin,
+    'INTEGER': Name.Builtin,
+    'INTERVAL': Name.Builtin,
+    'NUMBER': Name.Builtin,
+    'NUMERIC': Name.Builtin,
+    'REAL': Name.Builtin,
+    'SERIAL': Name.Builtin,
+    'SMALLINT': Name.Builtin,
+    'VARCHAR': Name.Builtin,
+    'VARYING': Name.Builtin,
+    'INT8': Name.Builtin,
+    'SERIAL8': Name.Builtin,
+    'TEXT': Name.Builtin,
+    }
+
+
+KEYWORDS_COMMON = {
+    'SELECT': Keyword.DML,
+    'INSERT': Keyword.DML,
+    'DELETE': Keyword.DML,
+    'UPDATE': Keyword.DML,
+    'DROP': Keyword.DDL,
+    'CREATE': Keyword.DDL,
+    'ALTER': Keyword.DDL,
+
+    'WHERE': Keyword,
+    'FROM': Keyword,
+    'INNER': Keyword,
+    'JOIN': Keyword,
+    'AND': Keyword,
+    'OR': Keyword,
+    'LIKE': Keyword,
+    'ON': Keyword,
+    'IN': Keyword,
+
+    'BY': Keyword,
+    'GROUP': Keyword,
+    'ORDER': Keyword,
+    'LEFT': Keyword,
+    'OUTER': Keyword,
+
+    'IF': Keyword,
+    'END': Keyword,
+    'THEN': Keyword,
+    'LOOP': Keyword,
+    'AS': Keyword,
+    'ELSE': Keyword,
+    'FOR': Keyword,
+
+    'CASE': Keyword,
+    'WHEN': Keyword,
+    'MIN': Keyword,
+    'MAX': Keyword,
+    'DISTINCT': Keyword,
+
+    }
diff --git a/sqlparse/lexer.py b/sqlparse/lexer.py
new file mode 100644
index 0000000..b635fc6
--- /dev/null
+++ b/sqlparse/lexer.py
@@ -0,0 +1,310 @@
+# Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com
+#
+# This module is part of python-sqlparse and is released under
+# the BSD License: http://www.opensource.org/licenses/bsd-license.php.
+
+"""SQL Lexer"""
+
+# This code is based on the SqlLexer in pygments.
+# http://pygments.org/
+# It's separated from the rest of pygments to increase performance
+# and to allow some customizations.
+
+import re
+
+from sqlparse.keywords import KEYWORDS, KEYWORDS_COMMON
+from sqlparse.tokens import *
+from sqlparse.tokens import _TokenType
+
+
+class include(str):
+    pass
+
+class combined(tuple):
+    """Indicates a state combined from multiple states."""
+
+    def __new__(cls, *args):
+        return tuple.__new__(cls, args)
+
+    def __init__(self, *args):
+        # tuple.__init__ doesn't do anything
+        pass
+
+def is_keyword(value):
+    test = value.upper()
+    return KEYWORDS_COMMON.get(test, KEYWORDS.get(test, Name)), value
+
+
+def apply_filters(stream, filters, lexer=None):
+    """
+    Use this method to apply an iterable of filters to
+    a stream. If lexer is given it's forwarded to the
+    filter, otherwise the filter receives `None`.
+    """
+    def _apply(filter_, stream):
+        for token in filter_.filter(lexer, stream):
+            yield token
+    for filter_ in filters:
+        stream = _apply(filter_, stream)
+    return stream
+
+
+class LexerMeta(type):
+    """
+    Metaclass for Lexer, creates the self._tokens attribute from
+    self.tokens on the first instantiation.
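+
+    Rules are compiled once and cached on the class (see ``__call__``
+    below); subsequent instantiations reuse the processed definitions.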
+ """ + + def _process_state(cls, unprocessed, processed, state): + assert type(state) is str, "wrong state name %r" % state + assert state[0] != '#', "invalid state name %r" % state + if state in processed: + return processed[state] + tokens = processed[state] = [] + rflags = cls.flags + for tdef in unprocessed[state]: + if isinstance(tdef, include): + # it's a state reference + assert tdef != state, "circular state reference %r" % state + tokens.extend(cls._process_state(unprocessed, processed, str(tdef))) + continue + + assert type(tdef) is tuple, "wrong rule def %r" % tdef + + try: + rex = re.compile(tdef[0], rflags).match + except Exception, err: + raise ValueError("uncompilable regex %r in state %r of %r: %s" % + (tdef[0], state, cls, err)) + + assert type(tdef[1]) is _TokenType or callable(tdef[1]), \ + 'token type must be simple type or callable, not %r' % (tdef[1],) + + if len(tdef) == 2: + new_state = None + else: + tdef2 = tdef[2] + if isinstance(tdef2, str): + # an existing state + if tdef2 == '#pop': + new_state = -1 + elif tdef2 in unprocessed: + new_state = (tdef2,) + elif tdef2 == '#push': + new_state = tdef2 + elif tdef2[:5] == '#pop:': + new_state = -int(tdef2[5:]) + else: + assert False, 'unknown new state %r' % tdef2 + elif isinstance(tdef2, combined): + # combine a new state from existing ones + new_state = '_tmp_%d' % cls._tmpname + cls._tmpname += 1 + itokens = [] + for istate in tdef2: + assert istate != state, 'circular state ref %r' % istate + itokens.extend(cls._process_state(unprocessed, + processed, istate)) + processed[new_state] = itokens + new_state = (new_state,) + elif isinstance(tdef2, tuple): + # push more than one state + for state in tdef2: + assert (state in unprocessed or + state in ('#pop', '#push')), \ + 'unknown new state ' + state + new_state = tdef2 + else: + assert False, 'unknown new state def %r' % tdef2 + tokens.append((rex, tdef[1], new_state)) + return tokens + + def process_tokendef(cls): + cls._all_tokens = {} + cls._tmpname = 0 + processed = cls._all_tokens[cls.__name__] = {} + #tokendefs = tokendefs or cls.tokens[name] + for state in cls.tokens.keys(): + cls._process_state(cls.tokens, processed, state) + return processed + + def __call__(cls, *args, **kwds): + if not hasattr(cls, '_tokens'): + cls._all_tokens = {} + cls._tmpname = 0 + if hasattr(cls, 'token_variants') and cls.token_variants: + # don't process yet + pass + else: + cls._tokens = cls.process_tokendef() + + return type.__call__(cls, *args, **kwds) + + + + +class Lexer: + + __metaclass__ = LexerMeta + + encoding = 'utf-8' + stripall = False + stripnl = False + tabsize = 0 + flags = re.IGNORECASE + + tokens = { + 'root': [ + (r'--.*?(\r|\n|\r\n)', Comment.Single), + (r'(\r|\n|\r\n)', Newline), + (r'\s+', Whitespace), + (r'/\*', Comment.Multiline, 'multiline-comments'), + (r':=', Assignment), + (r'::', Punctuation), + (r'[*]', Wildcard), + (r'[+/<>=~!@#%^&|`?^-]', Operator), + (r'[0-9]+', Number.Integer), + # TODO: Backslash escapes? 
+ (r"'(''|[^'])*'", String.Single), + (r'"(""|[^"])*"', String.Symbol), # not a real string literal in ANSI SQL + (r'(LEFT |RIGHT )?(INNER |OUTER )?JOIN', Keyword), + (r'END( IF| LOOP)?', Keyword), + (r'CREATE( OR REPLACE)?', Keyword.DDL), + (r'[a-zA-Z_][a-zA-Z0-9_]*', is_keyword), + (r'\$([a-zA-Z_][a-zA-Z0-9_]*)?\$', Name.Builtin), + (r'[;:()\[\],\.]', Punctuation), + ], + 'multiline-comments': [ + (r'/\*', Comment.Multiline, 'multiline-comments'), + (r'\*/', Comment.Multiline, '#pop'), + (r'[^/\*]+', Comment.Multiline), + (r'[/*]', Comment.Multiline) + ] + } + + def __init__(self): + self.filters = [] + + def add_filter(self, filter_, **options): + from sqlparse.filters import Filter + if not isinstance(filter_, Filter): + filter_ = filter_(**options) + self.filters.append(filter_) + + def get_tokens(self, text, unfiltered=False): + """ + Return an iterable of (tokentype, value) pairs generated from + `text`. If `unfiltered` is set to `True`, the filtering mechanism + is bypassed even if filters are defined. + + Also preprocess the text, i.e. expand tabs and strip it if + wanted and applies registered filters. + """ + if not isinstance(text, unicode): + if self.encoding == 'guess': + try: + text = text.decode('utf-8') + if text.startswith(u'\ufeff'): + text = text[len(u'\ufeff'):] + except UnicodeDecodeError: + text = text.decode('latin1') + elif self.encoding == 'chardet': + try: + import chardet + except ImportError: + raise ImportError('To enable chardet encoding guessing, ' + 'please install the chardet library ' + 'from http://chardet.feedparser.org/') + enc = chardet.detect(text) + text = text.decode(enc['encoding']) + else: + text = text.decode(self.encoding) + if self.stripall: + text = text.strip() + elif self.stripnl: + text = text.strip('\n') + if self.tabsize > 0: + text = text.expandtabs(self.tabsize) +# if not text.endswith('\n'): +# text += '\n' + + def streamer(): + for i, t, v in self.get_tokens_unprocessed(text): + yield t, v + stream = streamer() + if not unfiltered: + stream = apply_filters(stream, self.filters, self) + return stream + + + def get_tokens_unprocessed(self, text, stack=('root',)): + """ + Split ``text`` into (tokentype, text) pairs. 
+
+        ``stack`` is the initial stack (default: ``['root']``)
+        """
+        pos = 0
+        tokendefs = self._tokens
+        statestack = list(stack)
+        statetokens = tokendefs[statestack[-1]]
+        known_names = {}
+        while 1:
+            for rexmatch, action, new_state in statetokens:
+                m = rexmatch(text, pos)
+                if m:
+                    # print rex.pattern
+                    value = m.group()
+                    if value in known_names:
+                        yield pos, known_names[value], value
+                    elif type(action) is _TokenType:
+                        yield pos, action, value
+                    elif hasattr(action, '__call__'):
+                        ttype, value = action(value)
+                        known_names[value] = ttype
+                        yield pos, ttype, value
+                    else:
+                        for item in action(self, m):
+                            yield item
+                    pos = m.end()
+                    if new_state is not None:
+                        # state transition
+                        if isinstance(new_state, tuple):
+                            for state in new_state:
+                                if state == '#pop':
+                                    statestack.pop()
+                                elif state == '#push':
+                                    statestack.append(statestack[-1])
+                                else:
+                                    statestack.append(state)
+                        elif isinstance(new_state, int):
+                            # pop
+                            del statestack[new_state:]
+                        elif new_state == '#push':
+                            statestack.append(statestack[-1])
+                        else:
+                            assert False, "wrong state def: %r" % new_state
+                        statetokens = tokendefs[statestack[-1]]
+                    break
+            else:
+                try:
+                    if text[pos] == '\n':
+                        # at EOL, reset state to "root"
+                        pos += 1
+                        statestack = ['root']
+                        statetokens = tokendefs['root']
+                        yield pos, Text, u'\n'
+                        continue
+                    yield pos, Error, text[pos]
+                    pos += 1
+                except IndexError:
+                    break
+
+
+def tokenize(sql):
+    """Tokenize sql.
+
+    Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream
+    of ``(token type, value)`` items.
+    """
+    lexer = Lexer()
+    return lexer.get_tokens(sql)
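For orientation only (illustrative, not part of the imported sources): ``tokenize`` can be driven as below; the sample statement is made up, and the output shows the plain ``(tokentype, value)`` pairs yielded by ``Lexer.get_tokens``.

    >>> from sqlparse.lexer import tokenize
    >>> for ttype, value in tokenize(u'select 1;'):
    ...     print ttype, repr(value)
    Token.Keyword.DML u'select'
    Token.Text.Whitespace u' '
    Token.Literal.Number.Integer u'1'
    Token.Punctuation u';'
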
diff --git a/sqlparse/tokens.py b/sqlparse/tokens.py
new file mode 100644
index 0000000..2c63c41
--- /dev/null
+++ b/sqlparse/tokens.py
@@ -0,0 +1,131 @@
+# Copyright (C) 2008 Andi Albrecht, albrecht.andi@gmail.com
+#
+# This module is part of python-sqlparse and is released under
+# the BSD License: http://www.opensource.org/licenses/bsd-license.php.
+
+# The Token implementation is based on pygments' token system written
+# by Georg Brandl.
+# http://pygments.org/
+
+"""Tokens"""
+
+try:
+    set
+except NameError:
+    from sets import Set as set
+
+
+class _TokenType(tuple):
+    parent = None
+
+    def split(self):
+        buf = []
+        node = self
+        while node is not None:
+            buf.append(node)
+            node = node.parent
+        buf.reverse()
+        return buf
+
+    def __init__(self, *args):
+        # no need to call super.__init__
+        self.subtypes = set()
+
+    def __contains__(self, val):
+        return self is val or (
+            type(val) is self.__class__ and
+            val[:len(self)] == self
+        )
+
+    def __getattr__(self, val):
+        if not val or not val[0].isupper():
+            return tuple.__getattribute__(self, val)
+        new = _TokenType(self + (val,))
+        setattr(self, val, new)
+        self.subtypes.add(new)
+        new.parent = self
+        return new
+
+    def __hash__(self):
+        return hash(tuple(self))
+
+    def __repr__(self):
+        return 'Token' + (self and '.' or '') + '.'.join(self)
+
+
+Token = _TokenType()
+
+# Special token types
+Text = Token.Text
+Whitespace = Text.Whitespace
+Newline = Whitespace.Newline
+Error = Token.Error
+# Text that doesn't belong to this lexer (e.g. HTML in PHP)
+Other = Token.Other
+
+# Common token types for source code
+Keyword = Token.Keyword
+Name = Token.Name
+Literal = Token.Literal
+String = Literal.String
+Number = Literal.Number
+Punctuation = Token.Punctuation
+Operator = Token.Operator
+Wildcard = Token.Wildcard
+Comment = Token.Comment
+Assignment = Token.Assignment
+
+# Generic types for non-source code
+Generic = Token.Generic
+
+# String and some others are not direct children of Token.
+# alias them:
+Token.Token = Token
+Token.String = String
+Token.Number = Number
+
+# SQL specific tokens
+DML = Keyword.DML
+DDL = Keyword.DDL
+Command = Keyword.Command
+
+Group = Token.Group
+Group.Parenthesis = Token.Group.Parenthesis
+Group.Comment = Token.Group.Comment
+Group.Where = Token.Group.Where
+
+
+def is_token_subtype(ttype, other):
+    """
+    Return True if ``ttype`` is a subtype of ``other``.
+
+    Exists for backwards compatibility; use ``ttype in other`` now.
+    """
+    return ttype in other
+
+
+def string_to_tokentype(s):
+    """
+    Convert a string into a token type::
+
+        >>> string_to_tokentype('String.Double')
+        Token.Literal.String.Double
+        >>> string_to_tokentype('Token.Literal.Number')
+        Token.Literal.Number
+        >>> string_to_tokentype('')
+        Token
+
+    Tokens that are already tokens are returned unchanged:
+
+        >>> string_to_tokentype(String)
+        Token.Literal.String
+    """
+    if isinstance(s, _TokenType):
+        return s
+    if not s:
+        return Token
+    node = Token
+    for item in s.split('.'):
+        node = getattr(node, item)
+    return node
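A short, illustrative sketch (not part of the imported sources) of the token hierarchy defined above, assuming the package is importable as ``sqlparse``:

    >>> from sqlparse import tokens
    >>> tokens.Keyword.DML in tokens.Keyword   # containment via _TokenType.__contains__
    True
    >>> tokens.string_to_tokentype('Keyword.DML') is tokens.Keyword.DML
    True
    >>> tokens.DML.split()   # path from the root token to the leaf
    [Token, Token.Keyword, Token.Keyword.DML]

Token types are created lazily by ``__getattr__`` and cached as attributes, which is why the ``is`` comparison above holds.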