1 files changed, 293 insertions, 0 deletions
diff --git a/src/examples/searchparser.py b/src/examples/searchparser.py
new file mode 100644
index 0000000..f9ee428
--- /dev/null
+++ b/src/examples/searchparser.py
@@ -0,0 +1,293 @@
+"""Search query parser
+
+version 2006-03-09
+
+This search query parser uses the excellent Pyparsing module 
+(http://pyparsing.sourceforge.net/) to parse search queries by users.
+It handles:
+
+* 'and', 'or' and implicit 'and' operators;
+* parentheses;
+* quoted strings;
+* wildcards at the end of a search term (help*);
+
+Requirements:
+* Python
+* Pyparsing
+
+If you run this script, it will perform a number of tests. To use is as a
+module, you should use inheritance on the SearchQueryParser class and overwrite
+the Get... methods. The ParserTest class gives a very simple example of how this
+could work.
+
+-------------------------------------------------------------------------------
+Copyright (c) 2006, Estrate, the Netherlands
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation 
+  and/or other materials provided with the distribution.
+* Neither the name of Estrate nor the names of its contributors may be used
+  to endorse or promote products derived from this software without specific
+  prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+CONTRIBUTORS:
+- Steven Mooij
+- Rudolph Froger
+- Paul McGuire
+
+TODO:
+- add more docs
+- ask someone to check my English texts
+- add more kinds of wildcards ('*' at the beginning and '*' inside a word)?
+"""
+from pyparsing import Word, alphanums, Keyword, Group, Combine, Forward, Suppress, Optional, OneOrMore, oneOf
+from sets import Set
+
+class SearchQueryParser:
+
+    def __init__(self):
+        self._methods = {
+            'and': self.evaluateAnd,
+            'or': self.evaluateOr,
+            'not': self.evaluateNot,
+            'parenthesis': self.evaluateParenthesis,
+            'quotes': self.evaluateQuotes,
+            'word': self.evaluateWord,
+            'wordwildcard': self.evaluateWordWildcard,
+        }
+        self._parser = self.parser()
+    
+    def parser(self):
+        """
+        This function returns a parser.
+        The grammar should be like most full text search engines (Google, Tsearch, Lucene).
+        
+        Grammar:
+        - a query consists of alphanumeric words, with an optional '*' wildcard
+          at the end of a word
+        - a sequence of words between quotes is a literal string
+        - words can be used together by using operators ('and' or 'or')
+        - words with operators can be grouped with parenthesis
+        - a word or group of words can be preceded by a 'not' operator
+        - the 'and' operator precedes an 'or' operator
+        - if an operator is missing, use an 'and' operator
+        """
+        operatorOr = Forward()
+        
+        operatorWord = Group(Combine(Word(alphanums) + Suppress('*'))).setResultsName('wordwildcard') | \
+                            Group(Word(alphanums)).setResultsName('word')
+        
+        operatorQuotesContent = Forward()
+        operatorQuotesContent << (
+            (operatorWord + operatorQuotesContent) | operatorWord
+        )
+        
+        operatorQuotes = Group(
+            Suppress('"') + operatorQuotesContent + Suppress('"')
+        ).setResultsName("quotes") | operatorWord
+        
+        operatorParenthesis = Group(
+            (Suppress("(") + operatorOr + Suppress(")"))
+        ).setResultsName("parenthesis") | operatorQuotes
+
+        operatorNot = Forward()
+        operatorNot << (Group(
+            Suppress(Keyword("not", caseless=True)) + operatorNot
+        ).setResultsName("not") | operatorParenthesis)
+
+        operatorAnd = Forward()
+        operatorAnd << (Group(
+            operatorNot + Suppress(Keyword("and", caseless=True)) + operatorAnd
+        ).setResultsName("and") | Group(
+            operatorNot + OneOrMore(~oneOf("and or") + operatorAnd)
+        ).setResultsName("and") | operatorNot)
+        
+        operatorOr << (Group(
+            operatorAnd + Suppress(Keyword("or", caseless=True)) + operatorOr
+        ).setResultsName("or") | operatorAnd)
+
+        return operatorOr.parseString
+
+    def evaluateAnd(self, argument):
+        return self.evaluate(argument[0]).intersection(self.evaluate(argument[1]))
+
+    def evaluateOr(self, argument):
+        return self.evaluate(argument[0]).union(self.evaluate(argument[1]))
+
+    def evaluateNot(self, argument):
+        return self.GetNot(self.evaluate(argument[0]))
+
+    def evaluateParenthesis(self, argument):
+        return self.evaluate(argument[0])
+
+    def evaluateQuotes(self, argument):
+        """Evaluate quoted strings
+
+        First is does an 'and' on the indidual search terms, then it asks the
+        function GetQuoted to only return the subset of ID's that contain the
+        literal string.
+        """
+        r = Set()
+        search_terms = []
+        for item in argument:
+            search_terms.append(item[0])
+            if len(r) == 0:
+                r = self.evaluate(item)
+            else:
+                r = r.intersection(self.evaluate(item))
+        return self.GetQuotes(' '.join(search_terms), r)
+
+    def evaluateWord(self, argument):
+        return self.GetWord(argument[0])
+
+    def evaluateWordWildcard(self, argument):
+        return self.GetWordWildcard(argument[0])
+        
+    def evaluate(self, argument):
+        return self._methods[argument.getName()](argument)
+
+    def Parse(self, query):
+        #print self._parser(query)[0]
+        return self.evaluate(self._parser(query)[0])
+
+    def GetWord(self, word):
+        return Set()
+
+    def GetWordWildcard(self, word):
+        return Set()
+
+    def GetQuotes(self, search_string, tmp_result):
+        return Set()
+
+    def GetNot(self, not_set):
+        return Set().difference(not_set)
+
+
+class ParserTest(SearchQueryParser):
+    """Tests the parser with some search queries
+    tests containts a dictionary with tests and expected results.
+    """
+    tests = {
+        'help': Set([1, 2, 4, 5]),
+        'help or hulp': Set([1, 2, 3, 4, 5]),
+        'help and hulp': Set([2]),
+        'help hulp': Set([2]),
+        'help and hulp or hilp': Set([2, 3, 4]),
+        'help or hulp and hilp': Set([1, 2, 3, 4, 5]),
+        'help or hulp or hilp or halp': Set([1, 2, 3, 4, 5, 6]),
+        '(help or hulp) and (hilp or halp)': Set([3, 4, 5]),
+        'help and (hilp or halp)': Set([4, 5]),
+        '(help and (hilp or halp)) or hulp': Set([2, 3, 4, 5]),
+        'not help': Set([3, 6, 7, 8]),
+        'not hulp and halp': Set([5, 6]),
+        'not (help and halp)': Set([1, 2, 3, 4, 6, 7, 8]),
+        '"help me please"': Set([2]),
+        '"help me please" or hulp': Set([2, 3]),
+        '"help me please" or (hulp and halp)': Set([2]),
+        'help*': Set([1, 2, 4, 5, 8]),
+        'help or hulp*': Set([1, 2, 3, 4, 5]),
+        'help* and hulp': Set([2]),
+        'help and hulp* or hilp': Set([2, 3, 4]),
+        'help* or hulp or hilp or halp': Set([1, 2, 3, 4, 5, 6, 8]),
+        '(help or hulp*) and (hilp* or halp)': Set([3, 4, 5]),
+        'help* and (hilp* or halp*)': Set([4, 5]),
+        '(help and (hilp* or halp)) or hulp*': Set([2, 3, 4, 5]),
+        'not help* and halp': Set([6]),
+        'not (help* and helpe*)': Set([1, 2, 3, 4, 5, 6, 7]),
+        '"help* me please"': Set([2]),
+        '"help* me* please" or hulp*': Set([2, 3]),
+        '"help me please*" or (hulp and halp)': Set([2]),
+        '"help me please" not (hulp and halp)': Set([2]),
+        '"help me please" hulp': Set([2]),
+        'help and hilp and not holp': Set([4]),
+        'help hilp not holp': Set([4]),
+        'help hilp and not holp': Set([4]),
+    }
+
+    docs = {
+        1: 'help',
+        2: 'help me please hulp',
+        3: 'hulp hilp',
+        4: 'help hilp',
+        5: 'halp thinks he needs help',
+        6: 'he needs halp',
+        7: 'nothing',
+        8: 'helper',
+    }
+        
+    index = {
+        'help': Set((1, 2, 4, 5)),
+        'me': Set((2,)),
+        'please': Set((2,)),
+        'hulp': Set((2, 3,)),
+        'hilp': Set((3, 4,)),
+        'halp': Set((5, 6,)),
+        'thinks': Set((5,)),
+        'he': Set((5, 6,)),
+        'needs': Set((5, 6,)),
+        'nothing': Set((7,)),
+        'helper': Set((8,)),
+    }
+
+    def GetWord(self, word):
+        if (self.index.has_key(word)):
+            return self.index[word]
+        else:
+            return Set()
+
+    def GetWordWildcard(self, word):
+        result = Set()
+        for item in self.index.keys():
+            if word == item[0:len(word)]:
+                result = result.union(self.index[item])
+        return result
+
+    def GetQuotes(self, search_string, tmp_result):
+        result = Set()
+        for item in tmp_result:
+            if self.docs[item].count(search_string):
+                result.add(item)
+        return result
+    
+    def GetNot(self, not_set):
+        all = Set(self.docs.keys())
+        return all.difference(not_set)
+
+    def Test(self):
+        all_ok = True
+        for item in self.tests.keys():
+            print item
+            r = self.Parse(item)
+            e = self.tests[item]
+            print 'Result: %s' % r
+            print 'Expect: %s' % e
+            if e == r:
+                print 'Test OK'
+            else:
+                all_ok = False
+                print '>>>>>>>>>>>>>>>>>>>>>>Test ERROR<<<<<<<<<<<<<<<<<<<<<'
+            print ''
+        return all_ok
+            
+if __name__=='__main__':
+    if ParserTest().Test():
+        print 'All tests OK'
+    else:
+        print 'One or more tests FAILED'