summaryrefslogtreecommitdiff
path: root/lib/chef_zero/solr/solr_parser.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/chef_zero/solr/solr_parser.rb')
-rw-r--r--lib/chef_zero/solr/solr_parser.rb194
1 files changed, 194 insertions, 0 deletions
diff --git a/lib/chef_zero/solr/solr_parser.rb b/lib/chef_zero/solr/solr_parser.rb
new file mode 100644
index 0000000..589b78f
--- /dev/null
+++ b/lib/chef_zero/solr/solr_parser.rb
@@ -0,0 +1,194 @@
+require 'chef_zero/solr/query/binary_operator'
+require 'chef_zero/solr/query/unary_operator'
+require 'chef_zero/solr/query/term'
+require 'chef_zero/solr/query/phrase'
+require 'chef_zero/solr/query/range_query'
+require 'chef_zero/solr/query/subquery'
+
module ChefZero
  module Solr
    # Hand-written recursive-descent parser for a Solr/Lucene-style query
    # string. It tokenizes the string on demand and builds a tree out of the
    # Query::Term, Query::Phrase, Query::RangeQuery, Query::Subquery,
    # Query::UnaryOperator and Query::BinaryOperator node classes.
    #
    # Usage: SolrParser.new('name:foo AND [a TO b]').parse
    #
    # All syntax errors are reported by raising a RuntimeError whose message
    # is built by #parse_error.
    class SolrParser
      DEFAULT_FIELD = 'text'

      # Single characters that always form a token of their own.
      SINGLE_CHAR_OPERATORS =
        ['"', '+', '-', '!', '(', ')', '{', '}', '[', ']', '^', ':'].freeze
      # Two-character tokens.
      # NOTE(review): these (and '!') are tokenized but are not listed in
      # #unary_operator? / #binary_operator?, so they end up as plain terms.
      DOUBLE_CHAR_OPERATORS = ['&&', '||'].freeze
      UNARY_OPERATORS = ['NOT', '+', '-'].freeze
      BINARY_OPERATORS = ['AND', 'OR', '^', ':'].freeze
      # Tokens that may legally close a range query.
      RANGE_TERMINATORS = ['}', ']'].freeze
      # Closing delimiters that are invalid at the start of an expression.
      CLOSING_DELIMITERS = ['}', ']', ')'].freeze

      # query_string - the raw query text to parse.
      def initialize(query_string)
        @query_string = query_string
        @index = 0
        # One-token lookahead buffer; nil means "nothing buffered".
        @next_token = nil
      end

      # Parses the query string and returns the root node of the tree.
      # Raises RuntimeError on malformed input (including an empty query).
      def parse
        read_expression
      end

      #
      # Tokenization
      #

      # Returns the next token without consuming it (nil at end of input).
      def peek_token
        @next_token ||= parse_token
      end

      # Returns the next token and consumes it (nil at end of input).
      def next_token
        result = peek_token
        @next_token = nil
        result
      end

      # Reads one token starting at @index, advancing @index past it.
      # Returns an operator string, a term string, or nil at end of input.
      def parse_token
        skip_whitespace
        return nil if eof?

        operator = peek_operator_token
        if operator
          @index += operator.length
          return operator
        end

        # Everything that isn't whitespace or an operator is part of a term
        # (characters plus backslash-escaped characters). The loop body runs
        # at least once, so a term is never empty.
        start_index = @index
        loop do
          # A backslash escapes the following character: step over both.
          @index += 1 if @query_string[@index] == '\\'
          @index += 1 if !eof?
          break if eof? || @query_string[@index] =~ /\s/ || peek_operator_token
        end
        @query_string[start_index...@index]
      end

      # Advances @index past any run of whitespace at the current position.
      def skip_whitespace
        if @query_string[@index] =~ /\s/
          whitespace = /\s+/.match(@query_string, @index)
          @index += whitespace[0].length
        end
      end

      # Returns the operator token that starts at @index, or nil if the text
      # at @index is not an operator. Does not advance @index.
      def peek_operator_token
        single = @query_string[@index]
        return single if SINGLE_CHAR_OPERATORS.include?(single)

        double = @query_string[@index..@index + 1]
        return double if DOUBLE_CHAR_OPERATORS.include?(double)

        nil
      end

      # True when no token is buffered and @index is past the end of input.
      def eof?
        !@next_token && @index >= @query_string.length
      end

      #
      # Parse tree creation
      #

      # Reads a sequence of single expressions joined by binary operators,
      # stopping at ')' or end of input. Two adjacent expressions with no
      # operator between them are joined with 'OR'.
      def read_expression
        result = read_single_expression
        # peek_token has the side effect of skipping whitespace for us, so
        # the eof? check that follows it is accurate.
        until peek_token == ')' || eof?
          operator = peek_token
          if binary_operator?(operator)
            next_token
          else
            # If 2 terms are next to each other, the default operator is OR
            operator = 'OR'
          end
          next_expression = read_single_expression

          # Build the operator, taking precedence into account.
          # NOTE(review): only the top-level node is re-associated, i.e. one
          # level of precedence rotation per operator.
          if result.is_a?(Query::BinaryOperator) &&
             binary_operator_precedence(operator) > binary_operator_precedence(result.operator)
            # a+b*c -> a+(b*c)
            new_right = Query::BinaryOperator.new(result.right, operator, next_expression)
            result = Query::BinaryOperator.new(result.left, result.operator, new_right)
          else
            # a*b+c -> (a*b)+c
            result = Query::BinaryOperator.new(result, operator, next_expression)
          end
        end
        result
      end

      # Raises a RuntimeError describing a syntax error.
      #
      # token - the offending token (may be nil at end of input).
      # str   - human-readable description of what was expected.
      def parse_error(token, str)
        raise "Error on token '#{token}' at #{@index} of '#{@query_string}': #{str}"
      end

      # Reads exactly one operand: a unary-operator expression, a quoted
      # phrase, a range query, a parenthesized subquery, or a bare term.
      # Raises RuntimeError if the input ends or an unexpected token is found.
      def read_single_expression
        token = next_token

        if !token
          parse_error(nil, "Expected expression!")

        elsif unary_operator?(token)
          operand = read_single_expression
          # TODO We rely on all unary operators having higher precedence than
          # all binary operators. Check if this is the case.
          Query::UnaryOperator.new(token, operand)

        elsif token == '"'
          read_phrase

        elsif token == '{' || token == '['
          read_range_query(token)

        elsif token == '('
          subquery = read_expression
          close_paren = next_token
          parse_error(close_paren, "Expected ')'") if close_paren != ')'
          Query::Subquery.new(subquery)

        elsif CLOSING_DELIMITERS.include?(token)
          parse_error(token, "Unexpected end paren")

        elsif binary_operator?(token)
          parse_error(token, "Unexpected binary operator")

        else
          Query::Term.new(token)
        end
      end

      # Reads the terms of a phrase up to the closing '"' (the opening quote
      # has already been consumed). Raises instead of looping forever when
      # the input ends before the closing quote.
      def read_phrase
        phrase_terms = []
        until (term = next_token) == '"'
          parse_error(nil, "Expected closing '\"' for phrase") if !term
          phrase_terms << Query::Term.new(term)
        end
        Query::Phrase.new(phrase_terms)
      end

      # Reads "<left> TO <right>" followed by '}' or ']'. start_token is the
      # already-consumed opening delimiter. The last two RangeQuery arguments
      # are true when the opening delimiter is '[' and the closing one is ']'.
      def read_range_query(start_token)
        left = next_token
        parse_error(left, "Expected left term in range query") if !left
        to = next_token
        parse_error(to, "Expected TO in range query") if to != "TO"
        right = next_token
        parse_error(right, "Expected right term in range query") if !right
        end_range = next_token
        if !RANGE_TERMINATORS.include?(end_range)
          parse_error(end_range, "Expected end of range ('}' or ']')")
        end
        Query::RangeQuery.new(left, right, start_token == '[', end_range == ']')
      end

      # True if token is one of the recognized prefix operators.
      def unary_operator?(token)
        UNARY_OPERATORS.include?(token)
      end

      # True if token is one of the recognized infix operators.
      def binary_operator?(token)
        BINARY_OPERATORS.include?(token)
      end

      # Binding strength of a binary operator; higher binds tighter.
      # Returns nil for anything that is not a known binary operator.
      def binary_operator_precedence(token)
        case token
        when '^'
          4
        when ':'
          3
        when 'AND'
          2
        when 'OR'
          1
        end
      end
    end
  end
end