1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
|
# -*- coding: utf-8 -*-
"""
sphinx.search
~~~~~~~~~~~~~
Create a search index for offline search.
:copyright: 2007 by Armin Ronacher.
:license: Python license.
"""
import re
import pickle
from collections import defaultdict
from docutils.nodes import Text, NodeVisitor
from .stemmer import PorterStemmer
from .json import dump_json
# Matches runs of word characters; re.UNICODE makes \w cover non-ASCII
# letters too.  (The original used an inline `(?u)` at the END of the
# pattern, which is an error on Python 3.11+ — global flags must lead.)
word_re = re.compile(r'\w+', re.UNICODE)
class Stemmer(PorterStemmer):
    """Convenience wrapper around the Porter stemmer.

    The underlying implementation wants explicit start/end indices;
    this wrapper supplies them so callers can simply pass a word.
    """

    def stem(self, word):
        # PorterStemmer.stem() takes the inclusive index of the last
        # character, hence len(word) - 1.
        last = len(word) - 1
        return PorterStemmer.stem(self, word, 0, last)
class WordCollector(NodeVisitor):
    """Doctree visitor that gathers every word for the `IndexBuilder`."""

    def __init__(self, document):
        NodeVisitor.__init__(self, document)
        self.found_words = []

    def dispatch_visit(self, node):
        # Only exact Text nodes carry searchable words; everything else
        # (structure, markup) is skipped.
        if type(node) is not Text:
            return
        self.found_words += word_re.findall(node.astext())
class IndexBuilder(object):
    """Build a search index from doctrees handed to `feed`.

    `freeze` produces the serializable structure consumed by
    `SearchFrontend`; `dump` writes that structure to a stream in one
    of the supported formats.
    """

    # serializer callables keyed by format name
    formats = {
        'json': dump_json,
        'pickle': pickle.dumps
    }

    def __init__(self):
        self._filenames = {}    # filename -> numeric file id
        self._mapping = {}      # stemmed word -> set of file ids
        self._titles = {}       # file id -> document title
        self._categories = {}   # category name -> set of file ids
        self._stemmer = Stemmer()

    def dump(self, stream, format):
        """Dump the freezed index to a stream."""
        serialize = self.formats[format]
        stream.write(serialize(self.freeze()))

    def freeze(self):
        """
        Create a useable data structure. You can pass this output
        to the `SearchFrontend` to search the index.
        """
        # Filenames ordered by their assigned id, so the id doubles as
        # the list index on the frontend side.
        filenames = sorted(self._filenames, key=self._filenames.get)
        categories = dict(sorted(self._categories.items()))
        titles = [self._titles[fid] for fid in sorted(self._titles)]
        mapping = dict(sorted(self._mapping.items()))
        return [filenames, categories, titles, mapping]

    def feed(self, filename, category, title, doctree):
        """Feed a doctree to the index."""
        file_id = self._filenames.setdefault(filename, len(self._filenames))
        self._titles[file_id] = title
        collector = WordCollector(doctree)
        doctree.walk(collector)
        self._categories.setdefault(category, set()).add(file_id)
        # Index both the title words and the body words, stemmed and
        # lower-cased.
        stem = self._stemmer.stem
        for word in word_re.findall(title) + collector.found_words:
            self._mapping.setdefault(stem(word.lower()), set()).add(file_id)
class SearchFrontend(object):
    """
    This class acts as a frontend for the search index. It can search
    a searchindex as provided by `IndexBuilder`.
    """

    def __init__(self, index):
        # index is the 4-item list produced by IndexBuilder.freeze():
        # [filenames, categories/areas, titles, word -> file-id mapping]
        self.filenames, self.areas, self.titles, self.words = index
        self._stemmer = Stemmer()

    def query(self, required, excluded, areas):
        """Return sorted ``(filename, title)`` pairs matching the query.

        A file matches when it contains every stemmed word in *required*,
        contains no word from *excluded*, and belongs to at least one of
        *areas*.  Results are sorted case-insensitively by title.
        """
        file_map = defaultdict(set)
        for word in required:
            if word not in self.words:
                # A required word occurs nowhere: nothing can match.
                return []
            for fid in self.words[word]:
                file_map[fid].add(word)
        # BUGFIX: was `file_map.iteritems()`, which is Python 2 only and
        # raises AttributeError on Python 3.
        return sorted(((self.filenames[fid], self.titles[fid])
                       for fid, words in file_map.items()
                       if len(words) == len(required) and
                       any(fid in self.areas.get(area, ()) for area in areas) and not
                       any(fid in self.words.get(word, ()) for word in excluded)
                       ), key=lambda x: x[1].lower())

    def search(self, searchstring, areas):
        """Parse *searchstring* and run `query`.

        Whitespace-separated terms are required; a leading ``-`` marks a
        term as excluded.  Terms are lower-cased and stemmed to match
        the index.
        """
        required = set()
        excluded = set()
        for word in searchstring.split():
            if word.startswith('-'):
                storage = excluded
                word = word[1:]
            else:
                storage = required
            storage.add(self._stemmer.stem(word.lower()))
        return self.query(required, excluded, areas)
|