diff options
author | Michael Smedberg <msmedberg@zendesk.com> | 2019-08-04 20:43:55 -0700 |
---|---|---|
committer | Paul McGuire <ptmcg@users.noreply.github.com> | 2019-08-04 22:43:55 -0500 |
commit | b0f76d82a134d2ea2324b384ac9eba6c50a516d3 (patch) | |
tree | 2f9cab12267e8b160ba26ef24019f99b66200c7c | |
parent | e0db26b5cbc05df73f7917ffeb8f1d3144e74ec1 (diff) | |
download | pyparsing-git-b0f76d82a134d2ea2324b384ac9eba6c50a516d3.tar.gz |
Example BigQuery view SQL parser (#112)
Example BigQuery view SQL parser
-rw-r--r-- | examples/bigquery_view_parser.py | 1510 |
1 files changed, 1510 insertions, 0 deletions
diff --git a/examples/bigquery_view_parser.py b/examples/bigquery_view_parser.py new file mode 100644 index 0000000..6172481 --- /dev/null +++ b/examples/bigquery_view_parser.py @@ -0,0 +1,1510 @@ +# bigquery_view_parser.py +# +# A parser to extract table names from BigQuery view definitions. +# This is based on the `select_parser.py` sample in pyparsing: +# https://github.com/pyparsing/pyparsing/blob/master/examples/select_parser.py +# +# Michael Smedberg +# + +from pyparsing import ParserElement, Suppress, Forward, CaselessKeyword +from pyparsing import MatchFirst, alphas, alphanums, Combine, Word +from pyparsing import QuotedString, CharsNotIn, Optional, Group, ZeroOrMore +from pyparsing import oneOf, delimitedList, restOfLine, cStyleComment +from pyparsing import infixNotation, opAssoc, OneOrMore, Regex, nums + + +class BigQueryViewParser: + """Parser to extract table info from BigQuery view definitions""" + _parser = None + _table_identifiers = set() + _with_aliases = set() + + def get_table_names(self, sql_stmt): + table_identifiers, with_aliases = self._parse(sql_stmt) + + # Table names and alias names might differ by case, but that's not + # relevant- aliases are not case sensitive + lower_aliases = BigQueryViewParser.lowercase_set_of_tuples(with_aliases) + tables = set([ + x for x in table_identifiers + if not BigQueryViewParser.lowercase_of_tuple(x) in lower_aliases + ]) + + # Table names ARE case sensitive as described at + # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity + return tables + + def _parse(self, sql_stmt): + BigQueryViewParser._table_identifiers.clear() + BigQueryViewParser._with_aliases.clear() + BigQueryViewParser._get_parser().parseString(sql_stmt) + + return (BigQueryViewParser._table_identifiers, BigQueryViewParser._with_aliases) + + @classmethod + def lowercase_of_tuple(cls, tuple_to_lowercase): + return tuple(x.lower() if x else None for x in tuple_to_lowercase) + + @classmethod + def lowercase_set_of_tuples(cls, set_of_tuples): + return set([BigQueryViewParser.lowercase_of_tuple(x) for x in set_of_tuples]) + + @classmethod + def _get_parser(cls): + if cls._parser is not None: + return cls._parser + + ParserElement.enablePackrat() + + LPAR, RPAR, COMMA, LBRACKET, RBRACKET, LT, GT = map(Suppress, "(),[]<>") + ungrouped_select_stmt = Forward().setName("select statement") + + # keywords + ( + UNION, ALL, AND, INTERSECT, EXCEPT, COLLATE, ASC, DESC, ON, USING, + NATURAL, INNER, CROSS, LEFT, RIGHT, OUTER, FULL, JOIN, AS, INDEXED, + NOT, SELECT, DISTINCT, FROM, WHERE, GROUP, BY, HAVING, ORDER, BY, + LIMIT, OFFSET, OR, CAST, ISNULL, NOTNULL, NULL, IS, BETWEEN, ELSE, + END, CASE, WHEN, THEN, EXISTS, COLLATE, IN, LIKE, GLOB, REGEXP, + MATCH, ESCAPE, CURRENT_TIME, CURRENT_DATE, CURRENT_TIMESTAMP, WITH, + EXTRACT, PARTITION, ROWS, RANGE, UNBOUNDED, PRECEDING, CURRENT, + ROW, FOLLOWING, OVER, INTERVAL, DATE_ADD, DATE_SUB, ADDDATE, + SUBDATE, REGEXP_EXTRACT, SPLIT, ORDINAL, FIRST_VALUE, LAST_VALUE, + NTH_VALUE, LEAD, LAG, PERCENTILE_CONT, PRECENTILE_DISC, RANK, + DENSE_RANK, PERCENT_RANK, CUME_DIST, NTILE, ROW_NUMBER, DATE, TIME, + DATETIME, TIMESTAMP, UNNEST, INT64, NUMERIC, FLOAT64, BOOL, BYTES, + GEOGRAPHY, ARRAY, STRUCT, SAFE_CAST, ANY_VALUE, ARRAY_AGG, + ARRAY_CONCAT_AGG, AVG, BIT_AND, BIT_OR, BIT_XOR, COUNT, COUNTIF, + LOGICAL_AND, LOGICAL_OR, MAX, MIN, STRING_AGG, SUM, CORR, + COVAR_POP, COVAR_SAMP, STDDEV_POP, STDDEV_SAMP, STDDEV, VAR_POP, + VAR_SAMP, VARIANCE, TIMESTAMP_ADD, TIMESTAMP_SUB, GENERATE_ARRAY, + GENERATE_DATE_ARRAY, GENERATE_TIMESTAMP_ARRAY, FOR, SYSTEMTIME, AS, + OF, WINDOW + ) = map(CaselessKeyword, + """ + UNION, ALL, AND, INTERSECT, EXCEPT, COLLATE, ASC, DESC, ON, USING, + NATURAL, INNER, CROSS, LEFT, RIGHT, OUTER, FULL, JOIN, AS, INDEXED, + NOT, SELECT, DISTINCT, FROM, WHERE, GROUP, BY, HAVING, ORDER, BY, + LIMIT, OFFSET, OR, CAST, ISNULL, NOTNULL, NULL, IS, BETWEEN, ELSE, + END, CASE, WHEN, THEN, EXISTS, COLLATE, IN, LIKE, GLOB, REGEXP, + MATCH, ESCAPE, CURRENT_TIME, CURRENT_DATE, CURRENT_TIMESTAMP, WITH, + EXTRACT, PARTITION, ROWS, RANGE, UNBOUNDED, PRECEDING, CURRENT, + ROW, FOLLOWING, OVER, INTERVAL, DATE_ADD, DATE_SUB, ADDDATE, + SUBDATE, REGEXP_EXTRACT, SPLIT, ORDINAL, FIRST_VALUE, LAST_VALUE, + NTH_VALUE, LEAD, LAG, PERCENTILE_CONT, PRECENTILE_DISC, RANK, + DENSE_RANK, PERCENT_RANK, CUME_DIST, NTILE, ROW_NUMBER, DATE, TIME, + DATETIME, TIMESTAMP, UNNEST, INT64, NUMERIC, FLOAT64, BOOL, BYTES, + GEOGRAPHY, ARRAY, STRUCT, SAFE_CAST, ANY_VALUE, ARRAY_AGG, + ARRAY_CONCAT_AGG, AVG, BIT_AND, BIT_OR, BIT_XOR, COUNT, COUNTIF, + LOGICAL_AND, LOGICAL_OR, MAX, MIN, STRING_AGG, SUM, CORR, + COVAR_POP, COVAR_SAMP, STDDEV_POP, STDDEV_SAMP, STDDEV, VAR_POP, + VAR_SAMP, VARIANCE, TIMESTAMP_ADD, TIMESTAMP_SUB, GENERATE_ARRAY, + GENERATE_DATE_ARRAY, GENERATE_TIMESTAMP_ARRAY, FOR, SYSTEMTIME, AS, + OF, WINDOW + """.replace(",", "").split()) + + keyword_nonfunctions = MatchFirst(( + UNION, ALL, INTERSECT, EXCEPT, COLLATE, ASC, DESC, ON, USING, + NATURAL, INNER, CROSS, LEFT, RIGHT, OUTER, FULL, JOIN, AS, INDEXED, + NOT, SELECT, DISTINCT, FROM, WHERE, GROUP, BY, HAVING, ORDER, BY, + LIMIT, OFFSET, CAST, ISNULL, NOTNULL, NULL, IS, BETWEEN, ELSE, END, + CASE, WHEN, THEN, EXISTS, COLLATE, IN, LIKE, GLOB, REGEXP, MATCH, + STRUCT, WINDOW)) + + keyword = keyword_nonfunctions | MatchFirst(( + ESCAPE, CURRENT_TIME, CURRENT_DATE, CURRENT_TIMESTAMP, DATE_ADD, + DATE_SUB, ADDDATE, SUBDATE, INTERVAL, STRING_AGG, REGEXP_EXTRACT, + SPLIT, ORDINAL, UNNEST, SAFE_CAST, PARTITION, TIMESTAMP_ADD, + TIMESTAMP_SUB, ARRAY, GENERATE_ARRAY, GENERATE_DATE_ARRAY, + GENERATE_TIMESTAMP_ARRAY)) + + identifier_word = Word(alphas + '_@#', alphanums + '@$#_') + identifier = ~keyword + identifier_word.copy() + collation_name = identifier.copy() + # NOTE: Column names can be keywords. Doc says they cannot, but in practice it seems to work. + column_name = identifier_word.copy() + qualified_column_name = Combine(column_name + ('.' + column_name) * (0, 6)) + # NOTE: As with column names, column aliases can be keywords, e.g. functions like `current_time`. Other + # keywords, e.g. `from` make parsing pretty difficult (e.g. "SELECT a from from b" is confusing.) + column_alias = ~keyword_nonfunctions + column_name.copy() + table_name = identifier.copy() + table_alias = identifier.copy() + index_name = identifier.copy() + function_name = identifier.copy() + parameter_name = identifier.copy() + # NOTE: The expression in a CASE statement can be an integer. E.g. this is valid SQL: + # select CASE 1 WHEN 1 THEN -1 ELSE -2 END from test_table + unquoted_case_identifier = ~keyword + Word(alphanums + '$_') + quoted_case_identifier = ~keyword + ( + QuotedString('"') ^ Suppress('`') + + CharsNotIn('`') + Suppress('`') + ) + case_identifier = (quoted_case_identifier | unquoted_case_identifier) + case_expr = ( + Optional(case_identifier + Suppress('.')) + + Optional(case_identifier + Suppress('.')) + + case_identifier + ) + + # expression + expr = Forward().setName("expression") + + integer = Regex(r"[+-]?\d+") + numeric_literal = Regex(r"[+-]?\d*\.?\d+([eE][+-]?\d+)?") + string_literal = ( + QuotedString("'") + | QuotedString('"') + | QuotedString('`') + ) + regex_literal = "r" + string_literal + blob_literal = Regex(r"[xX]'[0-9A-Fa-f]+'") + date_or_time_literal = ( + (DATE | TIME | DATETIME | TIMESTAMP) + + string_literal + ) + literal_value = ( + numeric_literal | string_literal | regex_literal + | blob_literal | date_or_time_literal | NULL + | CURRENT_TIME + Optional(LPAR + Optional(string_literal) + RPAR) + | CURRENT_DATE + Optional(LPAR + Optional(string_literal) + RPAR) + | CURRENT_TIMESTAMP + + Optional(LPAR + Optional(string_literal) + RPAR) + ) + bind_parameter = ( + Word("?", nums) + | Combine(oneOf(": @ $") + parameter_name) + ) + type_name = oneOf("""TEXT REAL INTEGER BLOB NULL TIMESTAMP STRING DATE + INT64 NUMERIC FLOAT64 BOOL BYTES DATETIME GEOGRAPHY TIME ARRAY + STRUCT""", caseless=True) + date_part = oneOf("""DAY DAY_HOUR DAY_MICROSECOND DAY_MINUTE DAY_SECOND + HOUR HOUR_MICROSECOND HOUR_MINUTE HOUR_SECOND MICROSECOND MINUTE + MINUTE_MICROSECOND MINUTE_SECOND MONTH QUARTER SECOND + SECOND_MICROSECOND WEEK YEAR YEAR_MONTH""", caseless=True) + datetime_operators = ( + DATE_ADD | DATE_SUB | ADDDATE | SUBDATE | TIMESTAMP_ADD + | TIMESTAMP_SUB + ) + + grouping_term = expr.copy() + ordering_term = Group( + expr('order_key') + + Optional(COLLATE + collation_name('collate')) + + Optional(ASC | DESC)('direction') + )("ordering_term") + + function_arg = expr.copy()("function_arg") + function_args = Optional( + "*" + | Optional(DISTINCT) + delimitedList(function_arg) + )("function_args") + function_call = ( + (function_name | keyword)("function_name") + + LPAR + Group(function_args)("function_args_group") + RPAR + ) + + navigation_function_name = ( + FIRST_VALUE | LAST_VALUE | NTH_VALUE | LEAD | LAG + | PERCENTILE_CONT | PRECENTILE_DISC + ) + aggregate_function_name = ( + ANY_VALUE | ARRAY_AGG | ARRAY_CONCAT_AGG | AVG | BIT_AND | BIT_OR + | BIT_XOR | COUNT | COUNTIF | LOGICAL_AND | LOGICAL_OR | MAX | MIN + | STRING_AGG | SUM + ) + statistical_aggregate_function_name = ( + CORR | COVAR_POP | COVAR_SAMP | STDDEV_POP | STDDEV_SAMP | STDDEV + | VAR_POP | VAR_SAMP | VARIANCE + ) + numbering_function_name = ( + RANK | DENSE_RANK | PERCENT_RANK | CUME_DIST | NTILE | ROW_NUMBER) + analytic_function_name = ( + navigation_function_name + | aggregate_function_name + | statistical_aggregate_function_name + | numbering_function_name + )("analytic_function_name") + partition_expression_list = delimitedList( + grouping_term + )("partition_expression_list") + window_frame_boundary_start = ( + UNBOUNDED + PRECEDING + | numeric_literal + (PRECEDING | FOLLOWING) + | CURRENT + ROW + ) + window_frame_boundary_end = ( + UNBOUNDED + FOLLOWING + | numeric_literal + (PRECEDING | FOLLOWING) + | CURRENT + ROW + ) + window_frame_clause = (ROWS | RANGE) + ( + ( + (UNBOUNDED + PRECEDING) + | (numeric_literal + PRECEDING) + | (CURRENT + ROW) + ) | + ( + BETWEEN + window_frame_boundary_start + + AND + window_frame_boundary_end + )) + window_name = identifier.copy()("window_name") + window_specification = ( + Optional(window_name) + + Optional(PARTITION + BY + partition_expression_list) + + Optional(ORDER + BY + delimitedList(ordering_term)) + + Optional(window_frame_clause)("window_specification") + ) + analytic_function = ( + analytic_function_name + + LPAR + function_args + RPAR + + OVER + (window_name | LPAR + Optional(window_specification) + RPAR) + )("analytic_function") + + string_agg_term = ( + STRING_AGG + + LPAR + + Optional(DISTINCT) + + expr + + Optional(COMMA + string_literal) + + Optional( + ORDER + BY + expr + + Optional(ASC | DESC) + + Optional(LIMIT + integer) + ) + + RPAR + )("string_agg") + array_literal = ( + Optional(ARRAY + Optional(LT + delimitedList(type_name) + GT)) + + LBRACKET + + delimitedList(expr) + + RBRACKET + ) + interval = INTERVAL + expr + date_part + array_generator = ( + GENERATE_ARRAY + + LPAR + + numeric_literal + + COMMA + + numeric_literal + + COMMA + + numeric_literal + + RPAR + ) + date_array_generator = ( + (GENERATE_DATE_ARRAY | GENERATE_TIMESTAMP_ARRAY) + + LPAR + + expr("start_date") + + COMMA + + expr("end_date") + + Optional(COMMA + interval) + + RPAR + ) + + explicit_struct = ( + STRUCT + + Optional(LT + delimitedList(type_name) + GT) + + LPAR + + Optional(delimitedList(expr + Optional(AS + identifier))) + + RPAR + ) + + case_when = WHEN + expr.copy()("when") + case_then = THEN + expr.copy()("then") + case_clauses = Group(ZeroOrMore((case_when + case_then))) + case_else = ELSE + expr.copy()("else") + case_stmt = ( + CASE + + Optional(case_expr.copy()) + + case_clauses("case_clauses") + + Optional(case_else) + END + )("case") + + expr_term = ( + (analytic_function)("analytic_function") + | (CAST + LPAR + expr + AS + type_name + RPAR)("cast") + | (SAFE_CAST + LPAR + expr + AS + type_name + RPAR)("safe_cast") + | ( + Optional(EXISTS) + + LPAR + ungrouped_select_stmt + RPAR + )("subselect") + | (literal_value)("literal") + | (bind_parameter)("bind_parameter") + | (EXTRACT + LPAR + expr + FROM + expr + RPAR)("extract") + | case_stmt + | ( + datetime_operators + + LPAR + expr + COMMA + interval + RPAR + )("date_operation") + | string_agg_term("string_agg_term") + | array_literal("array_literal") + | array_generator("array_generator") + | date_array_generator("date_array_generator") + | explicit_struct("explicit_struct") + | function_call("function_call") + | qualified_column_name("column") + ) + Optional( + LBRACKET + + (OFFSET | ORDINAL) + + LPAR + expr + RPAR + + RBRACKET + )("offset_ordinal") + + struct_term = (LPAR + delimitedList(expr_term) + RPAR) + + UNARY, BINARY, TERNARY = 1, 2, 3 + expr << infixNotation((expr_term | struct_term), [ + (oneOf('- + ~') | NOT, UNARY, opAssoc.RIGHT), + (ISNULL | NOTNULL | NOT + NULL, UNARY, opAssoc.LEFT), + ('||', BINARY, opAssoc.LEFT), + (oneOf('* / %'), BINARY, opAssoc.LEFT), + (oneOf('+ -'), BINARY, opAssoc.LEFT), + (oneOf('<< >> & |'), BINARY, opAssoc.LEFT), + (oneOf("= > < >= <= <> != !< !>"), BINARY, opAssoc.LEFT), + ( + IS + Optional(NOT) + | Optional(NOT) + IN + | Optional(NOT) + LIKE + | GLOB + | MATCH + | REGEXP, BINARY, opAssoc.LEFT + ), + ((BETWEEN, AND), TERNARY, opAssoc.LEFT), + ( + Optional(NOT) + IN + + LPAR + + Group(ungrouped_select_stmt | delimitedList(expr)) + + RPAR, + UNARY, + opAssoc.LEFT + ), + (AND, BINARY, opAssoc.LEFT), + (OR, BINARY, opAssoc.LEFT), + ]) + quoted_expr = ( + expr + ^ Suppress('"') + expr + Suppress('"') + ^ Suppress("'") + expr + Suppress("'") + ^ Suppress('`') + expr + Suppress('`') + )("quoted_expr") + + compound_operator = ( + UNION + Optional(ALL | DISTINCT) + | INTERSECT + DISTINCT + | EXCEPT + DISTINCT + | INTERSECT + | EXCEPT + )("compound_operator") + + join_constraint = Group( + Optional( + ON + expr + | USING + LPAR + + Group(delimitedList(qualified_column_name)) + + RPAR + ))("join_constraint") + + join_op = ( + COMMA + | Group( + Optional(NATURAL) + + Optional( + INNER + | CROSS + | LEFT + OUTER + | LEFT + | RIGHT + OUTER + | RIGHT + | FULL + OUTER + | OUTER + | FULL + ) + JOIN + ) + )("join_op") + + join_source = Forward() + + # We support three kinds of table identifiers. + # + # First, dot delimited info like project.dataset.table, where + # each component follows the rules described in the BigQuery + # docs, namely: + # Contain letters (upper or lower case), numbers, and underscores + # + # Second, a dot delimited quoted string. Since it's quoted, we'll be + # liberal w.r.t. what characters we allow. E.g.: + # `project.dataset.name-with-dashes` + # + # Third, a series of quoted strings, delimited by dots, e.g.: + # `project`.`dataset`.`name-with-dashes` + # + # We won't attempt to support combinations, like: + # project.dataset.`name-with-dashes` + # `project`.`dataset.name-with-dashes` + + def record_table_identifier(t): + identifier_list = t.asList() + padded_list = [None] * (3 - len(identifier_list)) + identifier_list + cls._table_identifiers.add(tuple(padded_list)) + + standard_table_part = ~keyword + Word(alphanums + "_") + standard_table_identifier = ( + Optional(standard_table_part("project") + Suppress('.')) + + Optional(standard_table_part("dataset") + Suppress('.')) + + standard_table_part("table") + ).setParseAction(lambda t: record_table_identifier(t)) + + quoted_project_part = ( + Suppress('"') + CharsNotIn('"') + Suppress('"') + | Suppress("'") + CharsNotIn("'") + Suppress("'") + | Suppress("`") + CharsNotIn("`") + Suppress("`") + ) + quoted_table_part = ( + Suppress('"') + CharsNotIn('".') + Suppress('"') + | Suppress("'") + CharsNotIn("'.") + Suppress("'") + | Suppress("`") + CharsNotIn("`.") + Suppress("`") + ) + quoted_table_parts_identifier = ( + Optional(quoted_project_part("project") + Suppress('.')) + + Optional(quoted_table_part("dataset") + Suppress('.')) + + quoted_table_part("table") + ).setParseAction(lambda t: record_table_identifier(t)) + + def record_quoted_table_identifier(t): + identifier_list = t.asList()[0].split('.') + first = ".".join(identifier_list[0:-2]) or None + second = identifier_list[-2] + third = identifier_list[-1] + identifier_list = [first, second, third] + padded_list = [None] * (3 - len(identifier_list)) + identifier_list + cls._table_identifiers.add(tuple(padded_list)) + + quotable_table_parts_identifier = ( + Suppress('"') + CharsNotIn('"') + Suppress('"') + | Suppress("'") + CharsNotIn("'") + Suppress("'") + | Suppress("`") + CharsNotIn("`") + Suppress("`") + ).setParseAction(lambda t: record_quoted_table_identifier(t)) + + table_identifier = ( + standard_table_identifier | + quoted_table_parts_identifier | + quotable_table_parts_identifier + ) + + single_source = ( + table_identifier + + Optional(Optional(AS) + table_alias("table_alias*")) + + Optional(FOR + SYSTEMTIME + AS + OF + string_literal) + + Optional( + INDEXED + BY + index_name("name") + | NOT + INDEXED + )("index") + | ( + LPAR + + ungrouped_select_stmt + + RPAR + + Optional(Optional(AS) + table_alias) + ) + | (LPAR + join_source + RPAR) + | (UNNEST + LPAR + expr + RPAR) + + Optional(Optional(AS) + column_alias) + ) + + join_source << ( + Group( + single_source + + OneOrMore(join_op + single_source + join_constraint) + ) + | single_source + ) + + over_partition = ( + PARTITION + BY + + delimitedList(partition_expression_list) + )("over_partition") + over_order = (ORDER + BY + delimitedList(ordering_term)) + over_unsigned_value_specification = expr + over_window_frame_preceding = ( + UNBOUNDED + PRECEDING + | over_unsigned_value_specification + PRECEDING + | CURRENT + ROW + ) + over_window_frame_following = ( + UNBOUNDED + FOLLOWING + | over_unsigned_value_specification + FOLLOWING + | CURRENT + ROW + ) + over_window_frame_bound = ( + over_window_frame_preceding + | over_window_frame_following + ) + over_window_frame_between = ( + BETWEEN + over_window_frame_bound + AND + over_window_frame_bound + ) + over_window_frame_extent = ( + over_window_frame_preceding + | over_window_frame_between + ) + over_row_or_range = ((ROWS | RANGE) + over_window_frame_extent) + over = ( + OVER + + LPAR + + Optional(over_partition) + + Optional(over_order) + + Optional(over_row_or_range) + + RPAR + )("over") + + result_column = ( + Optional(table_name + ".") + + "*" + + Optional(EXCEPT + LPAR + delimitedList(column_name) + RPAR) + | Group( + quoted_expr + + Optional(over) + + Optional(Optional(AS) + column_alias) + ) + ) + + select_core = ( + SELECT + + Optional(DISTINCT | ALL) + + Group(delimitedList(result_column))("columns") + + Optional(FROM + join_source("from*")) + + Optional(WHERE + expr) + + Optional( + GROUP + BY + + Group(delimitedList(grouping_term))("group_by_terms") + ) + + Optional(HAVING + expr("having_expr")) + + Optional( + ORDER + BY + + Group(delimitedList(ordering_term))("order_by_terms") + ) + ) + grouped_select_core = select_core | (LPAR + select_core + RPAR) + + ungrouped_select_stmt << ( + grouped_select_core + + ZeroOrMore(compound_operator + grouped_select_core) + + Optional( + LIMIT + + ( + Group(expr + OFFSET + expr) + | Group(expr + COMMA + expr) + | expr + )("limit") + ) + )("select") + select_stmt = ungrouped_select_stmt | (LPAR + ungrouped_select_stmt + RPAR) + + # define comment format, and ignore them + sql_comment = (oneOf("-- #") + restOfLine | cStyleComment) + select_stmt.ignore(sql_comment) + + def record_with_alias(t): + identifier_list = t.asList() + padded_list = [None] * (3 - len(identifier_list)) + identifier_list + cls._with_aliases.add(tuple(padded_list)) + + with_stmt = Forward().setName("with statement") + with_clause = Group( + identifier.setParseAction(lambda t: record_with_alias(t)) + + AS + + LPAR + (select_stmt | with_stmt) + RPAR + ) + with_core = (WITH + delimitedList(with_clause)) + with_stmt << (with_core + ungrouped_select_stmt) + with_stmt.ignore(sql_comment) + + select_or_with = (select_stmt | with_stmt) + select_or_with_parens = LPAR + select_or_with + RPAR + + cls._parser = (select_or_with | select_or_with_parens) + return cls._parser + + TEST_CASES = [ + [ + """ + SELECT x FROM y.a, b + """, + [ + (None, "y", "a"), + (None, None, "b",), + ] + ], + [ + """ + SELECT x FROM y.a JOIN b + """, + [ + (None, "y", "a"), + (None, None, "b"), + ] + ], + [ + """ + select * from xyzzy where z > 100 + """, + [ + (None, None, "xyzzy"), + ] + ], + [ + """ + select * from xyzzy where z > 100 order by zz + """, + [ + (None, None, "xyzzy"), + ] + ], + [ + """ + select * from xyzzy + """, + [ + (None, None, "xyzzy",), + ] + ], + [ + """ + select z.* from xyzzy + """, + [ + (None, None, "xyzzy",), + ] + ], + [ + """ + select a, b from test_table where 1=1 and b='yes' + """, + [ + (None, None, "test_table"), + ] + ], + [ + """ + select a, b from test_table where 1=1 and b in (select bb from foo) + """, + [ + (None, None, "test_table"), + (None, None, "foo"), + ] + ], + [ + """ + select z.a, b from test_table where 1=1 and b in (select bb from foo) + """, + [ + (None, None, "test_table"), + (None, None, "foo"), + ] + ], + [ + """ + select z.a, b from test_table where 1=1 and b in (select bb from foo) order by b,c desc,d + """, + [ + (None, None, "test_table"), + (None, None, "foo"), + ] + ], + [ + """ + select z.a, b from test_table left join test2_table where 1=1 and b in (select bb from foo) + """, + [ + (None, None, "test_table"), + (None, None, "test2_table"), + (None, None, "foo"), + ] + ], + [ + """ + select a, db.table.b as BBB from db.table where 1=1 and BBB='yes' + """, + [ + (None, "db", "table"), + ] + ], + [ + """ + select a, db.table.b as BBB from test_table,db.table where 1=1 and BBB='yes' + """, + [ + (None, None, "test_table"), + (None, "db", "table"), + ] + ], + [ + """ + select a, db.table.b as BBB from test_table,db.table where 1=1 and BBB='yes' limit 50 + """, + [ + (None, None, "test_table"), + (None, "db", "table"), + ] + ], + [ + """ + select a, b from test_table where (1=1 or 2=3) and b='yes' group by zx having b=2 order by 1 + """, + [ + (None, None, "test_table"), + ] + ], + [ + """ + select + a, + b + # this is a comment + from + test_table + # another comment + where (1=1 or 2=3) and b='yes' + #yup, a comment + group by zx having b=2 order by 1 + """, + [ + (None, None, "test_table"), + ] + ], + [ + """ + SELECT COUNT(DISTINCT foo) FROM bar JOIN baz ON bar.baz_id = baz.id + """, + [ + (None, None, "bar"), + (None, None, "baz"), + ] + ], + [ + """ + SELECT COUNT(DISTINCT foo) FROM bar, baz WHERE bar.baz_id = baz.id + """, + [ + (None, None, "bar"), + (None, None, "baz"), + ] + ], + [ + """ + WITH one AS (SELECT id FROM foo) SELECT one.id + """, + [ + (None, None, "foo"), + ] + ], + [ + """ + WITH one AS (SELECT id FROM foo), two AS (select id FROM bar) SELECT one.id, two.id + """, + [ + (None, None, "foo"), + (None, None, "bar"), + ] + ], + [ + """ + SELECT x, + RANK() OVER (ORDER BY x ASC) AS rank, + DENSE_RANK() OVER (ORDER BY x ASC) AS dense_rank, + ROW_NUMBER() OVER (PARTITION BY x ORDER BY y) AS row_num + FROM a + """, + [ + (None, None, "a",), + ] + ], + [ + """ + SELECT x, COUNT(*) OVER ( ORDER BY x + RANGE BETWEEN 2 PRECEDING AND 2 FOLLOWING ) AS count_x + FROM T + """, + [ + (None, None, "T",), + ] + ], + [ + """ + SELECT firstname, department, startdate, + RANK() OVER ( PARTITION BY department ORDER BY startdate ) AS rank + FROM Employees + """, + [ + (None, None, "Employees"), + ] + ], + + # A fragment from https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """ + SELECT 'Sophia Liu' as name, + TIMESTAMP '2016-10-18 2:51:45' as finish_time, + 'F30-34' as division + UNION ALL SELECT 'Lisa Stelzner', TIMESTAMP '2016-10-18 2:54:11', 'F35-39' + UNION ALL SELECT 'Nikki Leith', TIMESTAMP '2016-10-18 2:59:01', 'F30-34' + UNION ALL SELECT 'Lauren Matthews', TIMESTAMP '2016-10-18 3:01:17', 'F35-39' + UNION ALL SELECT 'Desiree Berry', TIMESTAMP '2016-10-18 3:05:42', 'F35-39' + UNION ALL SELECT 'Suzy Slane', TIMESTAMP '2016-10-18 3:06:24', 'F35-39' + UNION ALL SELECT 'Jen Edwards', TIMESTAMP '2016-10-18 3:06:36', 'F30-34' + UNION ALL SELECT 'Meghan Lederer', TIMESTAMP '2016-10-18 3:07:41', 'F30-34' + UNION ALL SELECT 'Carly Forte', TIMESTAMP '2016-10-18 3:08:58', 'F25-29' + UNION ALL SELECT 'Lauren Reasoner', TIMESTAMP '2016-10-18 3:10:14', 'F30-34' + """, + [ + ] + ], + + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """ + WITH finishers AS + (SELECT 'Sophia Liu' as name, + TIMESTAMP '2016-10-18 2:51:45' as finish_time, + 'F30-34' as division + UNION ALL SELECT 'Lisa Stelzner', TIMESTAMP '2016-10-18 2:54:11', 'F35-39' + UNION ALL SELECT 'Nikki Leith', TIMESTAMP '2016-10-18 2:59:01', 'F30-34' + UNION ALL SELECT 'Lauren Matthews', TIMESTAMP '2016-10-18 3:01:17', 'F35-39' + UNION ALL SELECT 'Desiree Berry', TIMESTAMP '2016-10-18 3:05:42', 'F35-39' + UNION ALL SELECT 'Suzy Slane', TIMESTAMP '2016-10-18 3:06:24', 'F35-39' + UNION ALL SELECT 'Jen Edwards', TIMESTAMP '2016-10-18 3:06:36', 'F30-34' + UNION ALL SELECT 'Meghan Lederer', TIMESTAMP '2016-10-18 3:07:41', 'F30-34' + UNION ALL SELECT 'Carly Forte', TIMESTAMP '2016-10-18 3:08:58', 'F25-29' + UNION ALL SELECT 'Lauren Reasoner', TIMESTAMP '2016-10-18 3:10:14', 'F30-34') + SELECT name, + FORMAT_TIMESTAMP('%X', finish_time) AS finish_time, + division, + FORMAT_TIMESTAMP('%X', fastest_time) AS fastest_time, + TIMESTAMP_DIFF(finish_time, fastest_time, SECOND) AS delta_in_seconds + FROM ( + SELECT name, + finish_time, + division, + FIRST_VALUE(finish_time) + OVER (PARTITION BY division ORDER BY finish_time ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS fastest_time + FROM finishers) + """, + [ + ] + ], + + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """ + WITH finishers AS + (SELECT 'Sophia Liu' as name, + TIMESTAMP '2016-10-18 2:51:45' as finish_time, + 'F30-34' as division + UNION ALL SELECT 'Lisa Stelzner', TIMESTAMP '2016-10-18 2:54:11', 'F35-39' + UNION ALL SELECT 'Nikki Leith', TIMESTAMP '2016-10-18 2:59:01', 'F30-34' + UNION ALL SELECT 'Lauren Matthews', TIMESTAMP '2016-10-18 3:01:17', 'F35-39' + UNION ALL SELECT 'Desiree Berry', TIMESTAMP '2016-10-18 3:05:42', 'F35-39' + UNION ALL SELECT 'Suzy Slane', TIMESTAMP '2016-10-18 3:06:24', 'F35-39' + UNION ALL SELECT 'Jen Edwards', TIMESTAMP '2016-10-18 3:06:36', 'F30-34' + UNION ALL SELECT 'Meghan Lederer', TIMESTAMP '2016-10-18 3:07:41', 'F30-34' + UNION ALL SELECT 'Carly Forte', TIMESTAMP '2016-10-18 3:08:58', 'F25-29' + UNION ALL SELECT 'Lauren Reasoner', TIMESTAMP '2016-10-18 3:10:14', 'F30-34') + SELECT name, + FORMAT_TIMESTAMP('%X', finish_time) AS finish_time, + division, + FORMAT_TIMESTAMP('%X', slowest_time) AS slowest_time, + TIMESTAMP_DIFF(slowest_time, finish_time, SECOND) AS delta_in_seconds + FROM ( + SELECT name, + finish_time, + division, + LAST_VALUE(finish_time) + OVER (PARTITION BY division ORDER BY finish_time ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS slowest_time + FROM finishers) + """, + [ + ] + ], + + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """ + WITH finishers AS + (SELECT 'Sophia Liu' as name, + TIMESTAMP '2016-10-18 2:51:45' as finish_time, + 'F30-34' as division + UNION ALL SELECT 'Lisa Stelzner', TIMESTAMP '2016-10-18 2:54:11', 'F35-39' + UNION ALL SELECT 'Nikki Leith', TIMESTAMP '2016-10-18 2:59:01', 'F30-34' + UNION ALL SELECT 'Lauren Matthews', TIMESTAMP '2016-10-18 3:01:17', 'F35-39' + UNION ALL SELECT 'Desiree Berry', TIMESTAMP '2016-10-18 3:05:42', 'F35-39' + UNION ALL SELECT 'Suzy Slane', TIMESTAMP '2016-10-18 3:06:24', 'F35-39' + UNION ALL SELECT 'Jen Edwards', TIMESTAMP '2016-10-18 3:06:36', 'F30-34' + UNION ALL SELECT 'Meghan Lederer', TIMESTAMP '2016-10-18 3:07:41', 'F30-34' + UNION ALL SELECT 'Carly Forte', TIMESTAMP '2016-10-18 3:08:58', 'F25-29' + UNION ALL SELECT 'Lauren Reasoner', TIMESTAMP '2016-10-18 3:10:14', 'F30-34') + SELECT name, + FORMAT_TIMESTAMP('%X', finish_time) AS finish_time, + division, + FORMAT_TIMESTAMP('%X', fastest_time) AS fastest_time, + FORMAT_TIMESTAMP('%X', second_fastest) AS second_fastest + FROM ( + SELECT name, + finish_time, + division,finishers, + FIRST_VALUE(finish_time) + OVER w1 AS fastest_time, + NTH_VALUE(finish_time, 2) + OVER w1 as second_fastest + FROM finishers + WINDOW w1 AS ( + PARTITION BY division ORDER BY finish_time ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)) + """, + [ + ] + ], + + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """ + WITH finishers AS + (SELECT 'Sophia Liu' as name, + TIMESTAMP '2016-10-18 2:51:45' as finish_time, + 'F30-34' as division + UNION ALL SELECT 'Lisa Stelzner', TIMESTAMP '2016-10-18 2:54:11', 'F35-39' + UNION ALL SELECT 'Nikki Leith', TIMESTAMP '2016-10-18 2:59:01', 'F30-34' + UNION ALL SELECT 'Lauren Matthews', TIMESTAMP '2016-10-18 3:01:17', 'F35-39' + UNION ALL SELECT 'Desiree Berry', TIMESTAMP '2016-10-18 3:05:42', 'F35-39' + UNION ALL SELECT 'Suzy Slane', TIMESTAMP '2016-10-18 3:06:24', 'F35-39' + UNION ALL SELECT 'Jen Edwards', TIMESTAMP '2016-10-18 3:06:36', 'F30-34' + UNION ALL SELECT 'Meghan Lederer', TIMESTAMP '2016-10-18 3:07:41', 'F30-34' + UNION ALL SELECT 'Carly Forte', TIMESTAMP '2016-10-18 3:08:58', 'F25-29' + UNION ALL SELECT 'Lauren Reasoner', TIMESTAMP '2016-10-18 3:10:14', 'F30-34') + SELECT name, + finish_time, + division, + LEAD(name) + OVER (PARTITION BY division ORDER BY finish_time ASC) AS followed_by + FROM finishers + """, + [ + ] + ], + + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """ + WITH finishers AS + (SELECT 'Sophia Liu' as name, + TIMESTAMP '2016-10-18 2:51:45' as finish_time, + 'F30-34' as division + UNION ALL SELECT 'Lisa Stelzner', TIMESTAMP '2016-10-18 2:54:11', 'F35-39' + UNION ALL SELECT 'Nikki Leith', TIMESTAMP '2016-10-18 2:59:01', 'F30-34' + UNION ALL SELECT 'Lauren Matthews', TIMESTAMP '2016-10-18 3:01:17', 'F35-39' + UNION ALL SELECT 'Desiree Berry', TIMESTAMP '2016-10-18 3:05:42', 'F35-39' + UNION ALL SELECT 'Suzy Slane', TIMESTAMP '2016-10-18 3:06:24', 'F35-39' + UNION ALL SELECT 'Jen Edwards', TIMESTAMP '2016-10-18 3:06:36', 'F30-34' + UNION ALL SELECT 'Meghan Lederer', TIMESTAMP '2016-10-18 3:07:41', 'F30-34' + UNION ALL SELECT 'Carly Forte', TIMESTAMP '2016-10-18 3:08:58', 'F25-29' + UNION ALL SELECT 'Lauren Reasoner', TIMESTAMP '2016-10-18 3:10:14', 'F30-34') + SELECT name, + finish_time, + division, + LEAD(name, 2) + OVER (PARTITION BY division ORDER BY finish_time ASC) AS two_runners_back + FROM finishers + """, + [ + ] + ], + + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """ + WITH finishers AS + (SELECT 'Sophia Liu' as name, + TIMESTAMP '2016-10-18 2:51:45' as finish_time, + 'F30-34' as division + UNION ALL SELECT 'Lisa Stelzner', TIMESTAMP '2016-10-18 2:54:11', 'F35-39' + UNION ALL SELECT 'Nikki Leith', TIMESTAMP '2016-10-18 2:59:01', 'F30-34' + UNION ALL SELECT 'Lauren Matthews', TIMESTAMP '2016-10-18 3:01:17', 'F35-39' + UNION ALL SELECT 'Desiree Berry', TIMESTAMP '2016-10-18 3:05:42', 'F35-39' + UNION ALL SELECT 'Suzy Slane', TIMESTAMP '2016-10-18 3:06:24', 'F35-39' + UNION ALL SELECT 'Jen Edwards', TIMESTAMP '2016-10-18 3:06:36', 'F30-34' + UNION ALL SELECT 'Meghan Lederer', TIMESTAMP '2016-10-18 3:07:41', 'F30-34' + UNION ALL SELECT 'Carly Forte', TIMESTAMP '2016-10-18 3:08:58', 'F25-29' + UNION ALL SELECT 'Lauren Reasoner', TIMESTAMP '2016-10-18 3:10:14', 'F30-34') + SELECT name, + finish_time, + division, + LAG(name) + OVER (PARTITION BY division ORDER BY finish_time ASC) AS preceding_runner + FROM finishers + """, + [ + ] + ], + + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """ + SELECT + PERCENTILE_CONT(x, 0) OVER() AS min, + PERCENTILE_CONT(x, 0.01) OVER() AS percentile1, + PERCENTILE_CONT(x, 0.5) OVER() AS median, + PERCENTILE_CONT(x, 0.9) OVER() AS percentile90, + PERCENTILE_CONT(x, 1) OVER() AS max + FROM UNNEST([0, 3, NULL, 1, 2]) AS x LIMIT 1 + """, + [ + ] + ], + + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions + [ + """ + SELECT + x, + PERCENTILE_DISC(x, 0) OVER() AS min, + PERCENTILE_DISC(x, 0.5) OVER() AS median, + PERCENTILE_DISC(x, 1) OVER() AS max + FROM UNNEST(['c', NULL, 'b', 'a']) AS x + """, + [ + ] + ], + + # From https://cloud.google.com/bigquery/docs/reference/standard-sql/timestamp_functions + [ + """ + SELECT + TIMESTAMP "2008-12-25 15:30:00 UTC" as original, + TIMESTAMP_ADD(TIMESTAMP "2008-12-25 15:30:00 UTC", INTERVAL 10 MINUTE) AS later + """, + [ + ] + ], + + # Previously hosted on https://cloud.google.com/bigquery/docs/reference/standard-sql/timestamp_functions, but + # appears to no longer be there + [ + """ + WITH date_hour_slots AS ( + SELECT + [ + STRUCT( + " 00:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 01:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 02:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 03:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 04:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 05:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 06:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 07:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 08:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY ) as dt_range), + STRUCT( + " 09:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01', current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 10:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 11:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 12:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 13:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 14:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 15:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 16:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 17:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 18:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 19:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 20:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 21:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 22:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range), + STRUCT( + " 23:00:00 UTC" as hrs, + GENERATE_DATE_ARRAY('2016-01-01',current_date(), INTERVAL 1 DAY) as dt_range) + ] + AS full_timestamps) + SELECT + dt AS dates, hrs, CAST(CONCAT( CAST(dt as STRING), CAST(hrs as STRING)) as TIMESTAMP) as timestamp_value + FROM `date_hour_slots`, date_hour_slots.full_timestamps LEFT JOIN full_timestamps.dt_range as dt + """, + [ + (None, "date_hour_slots", "full_timestamps"), + (None, "full_timestamps", "dt_range"), + ] + ], + [ + """ + SELECT + [foo], + ARRAY[foo], + ARRAY<int64, STRING>[foo, bar], + STRUCT(1, 3), + STRUCT<int64, STRING>(2, 'foo'), + current_date(), + GENERATE_ARRAY(5, NULL, 1), + GENERATE_DATE_ARRAY('2016-10-05', '2016-10-01', INTERVAL 1 DAY), + GENERATE_DATE_ARRAY('2016-10-05', NULL), + GENERATE_DATE_ARRAY('2016-01-01', '2016-12-31', INTERVAL 2 MONTH), + GENERATE_DATE_ARRAY('2000-02-01',current_date(), INTERVAL 1 DAY), + GENERATE_TIMESTAMP_ARRAY('2016-10-05 00:00:00', '2016-10-05 00:00:02', INTERVAL 1 SECOND) + FROM + bar + """, + [ + (None, None, "bar"), + ] + ], + [ + """ + SELECT GENERATE_ARRAY(start, 5) AS example_array + FROM UNNEST([3, 4, 5]) AS start + """, + [ + ] + ], + [ + """ + WITH StartsAndEnds AS ( + SELECT DATE '2016-01-01' AS date_start, DATE '2016-01-31' AS date_end + UNION ALL SELECT DATE "2016-04-01", DATE "2016-04-30" + UNION ALL SELECT DATE "2016-07-01", DATE "2016-07-31" + UNION ALL SELECT DATE "2016-10-01", DATE "2016-10-31" + ) + SELECT GENERATE_DATE_ARRAY(date_start, date_end, INTERVAL 1 WEEK) AS date_range + FROM StartsAndEnds + """, + [ + ] + ], + [ + """ + SELECT GENERATE_TIMESTAMP_ARRAY(start_timestamp, end_timestamp, INTERVAL 1 HOUR) + AS timestamp_array + FROM + (SELECT + TIMESTAMP '2016-10-05 00:00:00' AS start_timestamp, + TIMESTAMP '2016-10-05 02:00:00' AS end_timestamp + UNION ALL + SELECT + TIMESTAMP '2016-10-05 12:00:00' AS start_timestamp, + TIMESTAMP '2016-10-05 14:00:00' AS end_timestamp + UNION ALL + SELECT + TIMESTAMP '2016-10-05 23:59:00' AS start_timestamp, + TIMESTAMP '2016-10-06 01:59:00' AS end_timestamp) + """, + [ + ] + ], + [ + """ + SELECT DATE_SUB(current_date("-08:00")), INTERVAL 2 DAY) + """, + [ + ] + ], + [ + """ + SELECT + case when (a) then b else c end + FROM d + """, + [ + (None, None, "d",), + ] + ], + [ + """ + SELECT + e, + case when (f) then g else h end + FROM i + """, + [ + (None, None, "i",), + ] + ], + [ + """ + SELECT + case when j then k else l end + FROM m + """, + [ + (None, None, "m",), + ] + ], + [ + """ + SELECT + n, + case when o then p else q end + FROM r + """, + [ + (None, None, "r",), + ] + ], + [ + """ + SELECT + case s when (t) then u else v end + FROM w + """, + [ + (None, None, "w",), + ] + ], + [ + """ + SELECT + x, + case y when (z) then aa else ab end + FROM ac + """, + [ + (None, None, "ac",), + ] + ], + [ + """ + SELECT + case ad when ae then af else ag end + FROM ah + """, [ + (None, None, "ah",), + ] + ], + [ + """ + SELECT + ai, + case aj when ak then al else am end + FROM an + """, + [ + (None, None, "an",), + ] + ], + [ + """ + WITH + ONE AS (SELECT x FROM y), + TWO AS (select a FROM b) + SELECT y FROM onE JOIN TWo + """, + [ + (None, None, "y",), + (None, None, "b",), + ] + ], + [ + """ + SELECT + a, + (SELECT b FROM oNE) + FROM OnE + """, + [ + (None, None, "oNE",), + (None, None, "OnE",), + ] + ], + [ + """ + SELECT * FROM `a.b.c` + """, + [ + ("a", "b", "c"), + ] + ], + [ + """ + SELECT * FROM `b.c` + """, + [ + (None, "b", "c"), + ] + ], + [ + """ + SELECT * FROM `c` + """, + [ + (None, None, "c"), + ] + ], [ + """ + SELECT * FROM a.b.c + """, + [ + ("a", "b", "c"), + ] + ], + [ + """ + SELECT * FROM "a"."b"."c" + """, + [ + ("a", "b", "c"), + ] + ], + [ + """ + SELECT * FROM 'a'.'b'.'c' + """, + [ + ("a", "b", "c"), + ] + ], + [ + """ + SELECT * FROM `a`.`b`.`c` + """, + [ + ("a", "b", "c"), + ] + ], + [ + """ + SELECT * FROM "a.b.c" + """, + [ + ("a", "b", "c"), + ] + ], + [ + """ + SELECT * FROM 'a.b.c' + """, + [ + ("a", "b", "c"), + ] + ], + [ + """ + SELECT * FROM `a.b.c` + """, + [ + ("a", "b", "c"), + ] + ], + [ + """ + SELECT * + FROM t1 + WHERE t1.a IN (SELECT t2.a + FROM t2 ) FOR SYSTEM_TIME AS OF t1.timestamp_column) + """, + [ + (None, None, "t1"), + (None, None, "t2"), + ] + ], + [ + """ + WITH a AS (SELECT b FROM c) + SELECT d FROM A JOIN e ON f = g JOIN E ON h = i + """, + [ + (None, None, "c"), + (None, None, "e"), + (None, None, "E"), + ] + ], + [ + """ + with + a as ( + ( + select b from + ( + select c from d + ) + Union all + ( + select e from f + ) + ) + ) + + select g from h + """, + [ + (None, None, 'd'), + (None, None, 'f'), + (None, None, 'h'), + ] + ], + [ + """ + select + a AS ESCAPE, + b AS CURRENT_TIME, + c AS CURRENT_DATE, + d AS CURRENT_TIMESTAMP, + e AS DATE_ADD + FROM x + """, + [ + (None, None, 'x'), + ] + ] + ] + + def test(self): + for test_index, test_case in enumerate(BigQueryViewParser.TEST_CASES): + sql_stmt, expected_tables = test_case + + found_tables = self.get_table_names(sql_stmt) + expected_tables_set = set(expected_tables) + + if expected_tables_set != found_tables: + raise Exception(f"Test {test_index} failed- expected {expected_tables_set} but got {found_tables}") + + +if __name__ == '__main__': + BigQueryViewParser().test() |