summaryrefslogtreecommitdiff
path: root/lib/chef_zero/solr/solr_parser.rb
blob: 843a1b523651db6453487e1ae7e0aca2e8b2dda2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
require_relative "query/binary_operator"
require_relative "query/unary_operator"
require_relative "query/term"
require_relative "query/phrase"
require_relative "query/range_query"
require_relative "query/subquery"

module ChefZero
  module Solr
    class ParseError < RuntimeError; end

    class SolrParser
      def initialize(query_string)
        @query_string = query_string
        @index = 0
      end

      def parse
        read_expression
      end

      #
      # Tokenization
      #
      def peek_token
        @next_token ||= parse_token
      end

      def next_token
        result = peek_token
        @next_token = nil
        result
      end

      def parse_token
        # Skip whitespace
        skip_whitespace
        return nil if eof?

        # Operators
        operator = peek_operator_token
        if operator
          @index += operator.length
          operator
        else
          # Everything that isn't whitespace or an operator, is part of a term
          # (characters plus backslashed escaped characters)
          start_index = @index
          loop do
            if @query_string[@index] == '\\'
              @index += 1
            end
            @index += 1 unless eof?
            break if eof? || !peek_term_token
          end
          @query_string[start_index..@index - 1]
        end
      end

      def skip_whitespace
        if @query_string[@index] =~ /\s/
          whitespace = /\s+/.match(@query_string, @index) || peek
          @index += whitespace[0].length
        end
      end

      def peek_term_token
        return nil if @query_string[@index] =~ /\s/

        op = peek_operator_token
        !op || op == "-"
      end

      def peek_operator_token
        if ['"', "+", "-", "!", "(", ")", "{", "}", "[", "]", "^", ":"].include?(@query_string[@index])
          return @query_string[@index]
        else
          result = @query_string[@index..@index + 1]
          if ["&&", "||"].include?(result)
            return result
          end
        end

        nil
      end

      def eof?
        !@next_token && @index >= @query_string.length
      end

      # Parse tree creation
      def read_expression
        result = read_single_expression
        # Expression is over when we hit a close paren or eof
        # (peek_token has the side effect of skipping whitespace for us, so we
        # really know if we're at eof or not)
        until peek_token == ")" || eof?
          operator = peek_token
          if binary_operator?(operator)
            next_token
          else
            # If 2 terms are next to each other, the default operator is OR
            operator = "OR"
          end
          next_expression = read_single_expression

          # Build the operator, taking precedence into account
          if result.is_a?(Query::BinaryOperator) &&
              binary_operator_precedence(operator) > binary_operator_precedence(result.operator)
            # a+b*c -> a+(b*c)
            new_right = Query::BinaryOperator.new(result.right, operator, next_expression)
            result = Query::BinaryOperator.new(result.left, result.operator, new_right)
          else
            # a*b+c -> (a*b)+c
            result = Query::BinaryOperator.new(result, operator, next_expression)
          end
        end
        result
      end

      def parse_error(token, str)
        raise ChefZero::Solr::ParseError, "Error on token '#{token}' at #{@index} of '#{@query_string}': #{str}"
      end

      def read_single_expression
        token = next_token
        # If EOF, we have a problem Houston
        if !token
          parse_error(nil, "Expected expression!")

        # If it's an unary operand, build that
        elsif unary_operator?(token)
          operand = read_single_expression
          # TODO We rely on all unary operators having higher precedence than all
          # binary operators.  Check if this is the case.
          Query::UnaryOperator.new(token, operand)

        # If it's the start of a phrase, read the terms in the phrase
        elsif token == '"'
          # Read terms until close "
          phrase_terms = []
          until (term = next_token) == '"'
            phrase_terms << Query::Term.new(term)
          end
          Query::Phrase.new(phrase_terms)

        # If it's the start of a range query, build that
        elsif token == "{" || token == "["
          left = next_token
          parse_error(left, "Expected left term in range query") unless left
          to = next_token
          parse_error(left, "Expected TO in range query") if to != "TO"
          right = next_token
          parse_error(right, "Expected left term in range query") unless right
          end_range = next_token
          parse_error(right, "Expected end range '#{end_range}") unless ["}", "]"].include?(end_range)
          Query::RangeQuery.new(left, right, token == "[", end_range == "]")

        elsif token == "("
          subquery = read_expression
          close_paren = next_token
          parse_error(close_paren, "Expected ')'") if close_paren != ")"
          Query::Subquery.new(subquery)

        # If it's the end of a closure, raise an exception
        elsif ["}", "]", ")"].include?(token)
          parse_error(token, "Unexpected end paren")

        # If it's a binary operator, raise an exception
        elsif binary_operator?(token)
          parse_error(token, "Unexpected binary operator")

        # Otherwise it's a term.
        else
          term = Query::Term.new(token)
          if peek_token == ":"
            Query::BinaryOperator.new(term, next_token, read_single_expression)
          else
            term
          end
        end
      end

      def unary_operator?(token)
        [ "NOT", "+", "-", "!" ].include?(token)
      end

      def binary_operator?(token)
        [ "AND", "OR", "^", ":"].include?(token)
      end

      def binary_operator_precedence(token)
        case token
        when "^"
          4
        when ":"
          3
        when "AND"
          2
        when "OR"
          1
        end
      end

      DEFAULT_FIELD = "text".freeze
    end
  end
end