1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
|
require_relative "query/binary_operator"
require_relative "query/unary_operator"
require_relative "query/term"
require_relative "query/phrase"
require_relative "query/range_query"
require_relative "query/subquery"
module ChefZero
module Solr
class ParseError < RuntimeError; end
class SolrParser
def initialize(query_string)
@query_string = query_string
@index = 0
end
def parse
read_expression
end
#
# Tokenization
#
def peek_token
@next_token ||= parse_token
end
def next_token
result = peek_token
@next_token = nil
result
end
def parse_token
# Skip whitespace
skip_whitespace
return nil if eof?
# Operators
operator = peek_operator_token
if operator
@index += operator.length
operator
else
# Everything that isn't whitespace or an operator, is part of a term
# (characters plus backslashed escaped characters)
start_index = @index
loop do
if @query_string[@index] == '\\'
@index += 1
end
@index += 1 unless eof?
break if eof? || !peek_term_token
end
@query_string[start_index..@index - 1]
end
end
def skip_whitespace
if @query_string[@index] =~ /\s/
whitespace = /\s+/.match(@query_string, @index) || peek
@index += whitespace[0].length
end
end
def peek_term_token
return nil if @query_string[@index] =~ /\s/
op = peek_operator_token
!op || op == "-"
end
def peek_operator_token
if ['"', "+", "-", "!", "(", ")", "{", "}", "[", "]", "^", ":"].include?(@query_string[@index])
return @query_string[@index]
else
result = @query_string[@index..@index + 1]
if ["&&", "||"].include?(result)
return result
end
end
nil
end
def eof?
!@next_token && @index >= @query_string.length
end
# Parse tree creation
def read_expression
result = read_single_expression
# Expression is over when we hit a close paren or eof
# (peek_token has the side effect of skipping whitespace for us, so we
# really know if we're at eof or not)
until peek_token == ")" || eof?
operator = peek_token
if binary_operator?(operator)
next_token
else
# If 2 terms are next to each other, the default operator is OR
operator = "OR"
end
next_expression = read_single_expression
# Build the operator, taking precedence into account
if result.is_a?(Query::BinaryOperator) &&
binary_operator_precedence(operator) > binary_operator_precedence(result.operator)
# a+b*c -> a+(b*c)
new_right = Query::BinaryOperator.new(result.right, operator, next_expression)
result = Query::BinaryOperator.new(result.left, result.operator, new_right)
else
# a*b+c -> (a*b)+c
result = Query::BinaryOperator.new(result, operator, next_expression)
end
end
result
end
def parse_error(token, str)
raise ChefZero::Solr::ParseError, "Error on token '#{token}' at #{@index} of '#{@query_string}': #{str}"
end
def read_single_expression
token = next_token
# If EOF, we have a problem Houston
if !token
parse_error(nil, "Expected expression!")
# If it's an unary operand, build that
elsif unary_operator?(token)
operand = read_single_expression
# TODO We rely on all unary operators having higher precedence than all
# binary operators. Check if this is the case.
Query::UnaryOperator.new(token, operand)
# If it's the start of a phrase, read the terms in the phrase
elsif token == '"'
# Read terms until close "
phrase_terms = []
until (term = next_token) == '"'
phrase_terms << Query::Term.new(term)
end
Query::Phrase.new(phrase_terms)
# If it's the start of a range query, build that
elsif token == "{" || token == "["
left = next_token
parse_error(left, "Expected left term in range query") unless left
to = next_token
parse_error(left, "Expected TO in range query") if to != "TO"
right = next_token
parse_error(right, "Expected left term in range query") unless right
end_range = next_token
parse_error(right, "Expected end range '#{end_range}") unless ["}", "]"].include?(end_range)
Query::RangeQuery.new(left, right, token == "[", end_range == "]")
elsif token == "("
subquery = read_expression
close_paren = next_token
parse_error(close_paren, "Expected ')'") if close_paren != ")"
Query::Subquery.new(subquery)
# If it's the end of a closure, raise an exception
elsif ["}", "]", ")"].include?(token)
parse_error(token, "Unexpected end paren")
# If it's a binary operator, raise an exception
elsif binary_operator?(token)
parse_error(token, "Unexpected binary operator")
# Otherwise it's a term.
else
term = Query::Term.new(token)
if peek_token == ":"
Query::BinaryOperator.new(term, next_token, read_single_expression)
else
term
end
end
end
def unary_operator?(token)
[ "NOT", "+", "-", "!" ].include?(token)
end
def binary_operator?(token)
[ "AND", "OR", "^", ":"].include?(token)
end
def binary_operator_precedence(token)
case token
when "^"
4
when ":"
3
when "AND"
2
when "OR"
1
end
end
DEFAULT_FIELD = "text".freeze
end
end
end
|