root/pyndexter/trunk/pyndexter/indexers/builtin.py @ 449

Revision 449, 9.1 KB (checked in by athomas, 3 years ago)

pyndexter: Moving to a cleaner API - removed source and state code.

  • Property svn:eol-style set to native
Line 
1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2006 Alec Thomas <alec@swapoff.org>
4#
5# This software is licensed as described in the file COPYING, which
6# you should have received as part of this distribution.
7#
8
9"""
10Builtin Indexer
11---------------
12
13The builtin Pyndexter indexer.
14
15Pyndexter provides a basic inverted index indexer. It does not currently
16support substring matching, wildcards, or scoring, but these features are
17planned.
18
19Usage
20~~~~~
21
22::
23
24    builtin://<path>?compact=<bool>&cache=<bool>&dbm=<dbm>
25
26``compact=<bool>`` (default: ``true``)
27    Whether to compact the database as much as possible. Slight slowdown.
28
29``cache=<bool>`` (default: ``false``)
30    Should we keep a cached copy of each document as it is indexed?
31
32``dbm=<dbm>`` (default: ``anydbm``)
33    Supported dbm modules are ``anydbm``, ``dbhash``, ``gdbm`` and ``dbm`` (Python 2.5).
34
35Installation
36~~~~~~~~~~~~
37
38No installation is required. Pyndexter uses the anydbm Python module for
39storage.
40"""
41
42import os
43import re
44import anydbm
45import cPickle as pickle
46import md5
47from UserDict import DictMixin
48from StringIO import StringIO
49from gzip import GzipFile
50from pyndexter import *
51from pyndexter.util import set
52
53
class KeyedSet(object):
    """A persistent ``key -> set(values)`` mapping on top of a mapping-like db.

    Keys and the stored sets are both pickled (protocol 2) before they touch
    ``db``, so ``db`` only needs item get/set/delete and ``keys()``.
    """

    def __init__(self, db):
        """Wrap ``db``, the object that stores the pickled keys and sets."""
        self.db = db

    def update(self, key, values):
        """Add every item of ``values`` to the set stored under ``key``.

        A missing ``key`` is treated as an empty set.
        """
        key = pickle.dumps(key, 2)
        try:
            v = pickle.loads(self.db[key])
        except KeyError:
            v = set()
        v.update(values)
        self.db[key] = pickle.dumps(v, 2)

    def remove(self, key, values=None):
        """Remove ``values`` from the set under ``key``, or drop ``key``.

        With ``values`` left as ``None`` the whole entry is deleted.
        Otherwise ``values`` is an iterable of items to take out of the
        stored set; items that are not present are ignored.  A missing
        ``key`` is silently ignored in both cases.
        """
        key = pickle.dumps(key, 2)
        if values is None:
            try:
                del self.db[key]
            except KeyError:
                pass
            return

        # Only the lookup may legitimately raise KeyError; keep it alone in
        # the try body so a failure further down is never swallowed.
        try:
            v = pickle.loads(self.db[key])
        except KeyError:
            return
        # ``values`` is a collection, so subtract it.  set.remove() would
        # look the whole collection up as one element and raise KeyError.
        v.difference_update(values)
        self.db[key] = pickle.dumps(v, 2)

    def replace(self, key, values):
        """Store ``values`` under ``key``, overwriting any previous set."""
        key = pickle.dumps(key, 2)
        self.db[key] = pickle.dumps(values, 2)

    def get(self, key):
        """Return the set stored under ``key``, or an empty set if absent."""
        key = pickle.dumps(key, 2)
        try:
            return pickle.loads(self.db[key])
        except KeyError:
            return set()

    def keys(self):
        """Iterate over the (unpickled) keys present in ``db``."""
        for key in self.db.keys():
            yield pickle.loads(key)
96
97
class PickleDict(DictMixin):
    """Dictionary facade over ``db`` that pickles every key and value.

    Only the primitive operations are written here; ``DictMixin`` is the
    base class supplying the remaining dictionary methods.
    """

    def __init__(self, db):
        self.db = db

    def __getitem__(self, key):
        """Return the unpickled value stored for ``key``."""
        raw_key = pickle.dumps(key, 2)
        raw_value = self.db[raw_key]
        return pickle.loads(raw_value)

    def __setitem__(self, key, value):
        """Pickle ``value`` and store it under the pickled ``key``."""
        raw_key = pickle.dumps(key, 2)
        raw_value = pickle.dumps(value, 2)
        self.db[raw_key] = raw_value

    def __delitem__(self, key):
        """Delete the entry stored for ``key``."""
        raw_key = pickle.dumps(key, 2)
        del self.db[raw_key]

    def keys(self):
        """Return a list of all keys, unpickled."""
        return list(map(pickle.loads, self.db.keys()))
114
115
class BuiltinIndexer(Indexer):
    """Builtin Pyndexter indexer: an inverted index kept in dbm databases.

    Words and document URIs are both referred to by the identifier returned
    from ``_wid``.  In compact mode that is a counter value (as unicode)
    mapped through the ``wordid``/``idword`` tables; otherwise it is simply
    the unicode text itself.

    Tables:
      * ``words``      -- wordid -> set(uriid)
      * ``uris``       -- uriid  -> set(wordid)
      * ``attributes`` -- uriid  -> dict of document attributes
      * ``cachedb``    -- uriid  -> document content (only when ``cache``)
    """

    def __init__(self, framework, path, dbm='anydbm', cache=False,
                 compact=True):
        """Open the index databases stored under ``path``.

        ``framework`` supplies ``mode`` and the ``reduce`` callable.  ``dbm``
        is the name of the dbm module to import.  ``cache`` keeps a copy of
        each document's content.  ``compact`` maps words to counter ids and
        gzips cached content.
        """
        Indexer.__init__(self, framework)

        self.path = path
        self.compact = compact
        self.cache = cache
        self.state_path = os.path.join(path, 'store.db')
        self.db_path = os.path.join(path, 'builtin.db')

        # We want the minimum set of words
        framework.reduce.split = True
        framework.reduce.unique = True

        # A non-empty fromlist makes __import__ return the named module
        # itself rather than its top-level package.
        dbm = __import__(dbm, {}, {}, [''])

        if framework.mode == READWRITE:
            if not os.path.exists(self.db_path):
                os.makedirs(self.db_path)
            mode = 'c'
        else:
            mode = 'r'

        def dbopen(name):
            return dbm.open(os.path.join(self.db_path, name), mode)

        # wordid:set(uriid)
        self.words = KeyedSet(dbopen('words'))
        # uriid:set(wordid)
        self.uris = KeyedSet(dbopen('uris'))
        # uriid:dict(attributes)
        self.attributes = PickleDict(dbopen('attributes'))

        if cache:
            self.cachedb = PickleDict(dbopen('cache'))

        if compact:
            # id:word mapping
            self.idword = PickleDict(dbopen('idword'))
            # word:id mapping
            self.wordid = PickleDict(dbopen('wordid'))
            # key:value config
            self.config = PickleDict(dbopen('config'))

            self.config.setdefault('wordid', 0)
        else:
            # Without compaction an identifier is just the unicode text, so
            # the translation methods are shadowed by trivial conversions.
            self._words = self._wids = lambda w: set(map(unicode, w))
            self._word = self._wid = lambda w: unicode(w)

    def index(self, document):
        """Add ``document`` to the index, or refresh it if already present.

        Reads ``document.uri``, ``document.content`` and
        ``document.attributes``.
        """
        uri = unicode(self._wid(document.uri))
        words = self._wids(self.framework.reduce(document.content))

        if self.cache:
            if self.compact:
                # Do a low-compression gzip
                buffer = StringIO()
                gz = GzipFile(fileobj=buffer, compresslevel=1, mode='wb')
                gz.write(document.content.encode('utf-8'))
                gz.close()
                self.cachedb[uri] = buffer.getvalue()
            else:
                self.cachedb[uri] = document.content

        self.attributes[uri] = document.attributes

        # The words this document contributed last time live in the
        # uriid -> set(wordid) table, keyed by the document identifier.
        old_words = self.uris.get(uri)
        removed_words = old_words.difference(words)
        new_words = words.difference(old_words)

        # Clear out words that have been removed from the document
        self._remove_postings(removed_words, uri)

        # Add new words
        doc_set = set([uri])
        for word in new_words:
            self.words.update(word, doc_set)

        self.uris.replace(uri, words)

    replace = index

    def discard(self, uri):
        """Remove the document at ``uri`` from every table.

        A URI that was never indexed is ignored.
        """
        uriid = self._known_wid(uri)
        if uriid is None:
            return

        try:
            del self.attributes[uriid]
        except KeyError:
            pass

        if self.cache:
            try:
                del self.cachedb[uriid]
            except KeyError:
                pass

        # Unlink the document from each word it contributed, then drop its
        # own uriid -> set(wordid) record.
        self._remove_postings(self.uris.get(uriid), uriid)
        self.uris.remove(uriid)

    def __iter__(self):
        """Yield a ``URI`` for every indexed document."""
        for uri in self.uris.keys():
            yield URI(self._word(uri))

    def fetch(self, uri):
        """Return a ``Document`` rebuilt from what the index holds for ``uri``.

        With caching enabled the stored content is returned (quality 0.99);
        otherwise the content is the document's indexed words joined by
        spaces (quality 0.1).
        """
        uri = unicode(uri)
        uriid = self._wid(uri)
        attributes = self.attributes.get(uriid, {})
        # Keys are encoded to byte strings before being passed as **kwargs.
        attributes = dict([(k.encode('utf-8'), v)
                           for k, v in attributes.iteritems()])
        attributes['uri'] = uri
        if self.cache:
            content = self.cachedb[uriid]
            if self.compact:
                gz = GzipFile(fileobj=StringIO(content), mode='rb')
                content = gz.read().decode('utf-8')
            quality = 0.99
        else:
            content = ' '.join(self._words(self.uris.get(uriid)))
            quality = 0.1
        return Document(content=content, quality=quality, **attributes)

    def close(self):
        """Close every open database and drop the references to them."""
        for name in ('words', 'uris', 'attributes', 'cachedb',
                     'wordid', 'idword', 'config'):
            # Some tables only exist for certain cache/compact settings.
            store = getattr(self, name, None)
            if store is not None:
                closer = getattr(store.db, 'close', None)
                if closer is not None:
                    closer()
            setattr(self, name, None)

    def search(self, query):
        """Evaluate ``query`` and return a ``BuiltinResult`` of matching URIs.

        Raises ``NotImplementedError`` for NOT nodes.
        """
        query.reduce(self.framework.reduce)
        # FIXME NOT is not supported yet; it needs some form of inverted
        # set to represent "everything except these documents".

        def visit(node):
            if node.type == node.TERM:
                # A term that was never indexed cannot match anything, and
                # must not be allocated an identifier just for being queried.
                wid = self._known_wid(node.value)
                if wid is None:
                    return set()
                return self.words.get(wid)
            elif node.type == node.AND:
                return visit(node.left).intersection(visit(node.right))
            elif node.type == node.OR:
                return visit(node.left).union(visit(node.right))
            elif node.type == node.NOT:
                raise NotImplementedError('NOT is ... not supported')

        uris = visit(query)
        return BuiltinResult(self, query, list(self._words(uris)))

    # Internal methods
    def _remove_postings(self, wids, uriid):
        """Remove ``uriid`` from the document set of every word in ``wids``.

        Done with get/replace so the outcome does not depend on how
        ``KeyedSet.remove`` interprets a set-valued argument.  A word left
        with no documents is deleted outright.
        """
        for wid in wids:
            postings = self.words.get(wid)
            postings.discard(uriid)
            if postings:
                self.words.replace(wid, postings)
            else:
                self.words.remove(wid)

    def _known_wid(self, word):
        """Return the identifier of ``word`` without allocating a new one.

        Returns ``None`` in compact mode when ``word`` has no identifier yet.
        """
        word = unicode(word)
        if not self.compact:
            return word
        try:
            return self.wordid[word]
        except KeyError:
            return None

    def _wids(self, words):
        """Convert a collection of words to a set of wids."""
        return set([self._wid(unicode(word)) for word in words])

    def _words(self, wids):
        """Convert a collection of wids to a set of words."""
        return set([self.idword[wid] for wid in wids])

    def _wid(self, word):
        """Return, or allocate, a unique word identifier."""
        word = unicode(word)
        try:
            return self.wordid[word]
        except KeyError:
            # Take the next counter value and record both directions.
            counter = self.config['wordid']
            self.config['wordid'] = counter + 1
            wid = unicode(counter)
            self.wordid[word] = wid
            self.idword[wid] = word
            return wid

    def _word(self, wid):
        """Return the word registered for ``wid``."""
        return self.idword[wid]
301
302
# Module-level factory for BuiltinIndexer; the ``cache`` and ``compact``
# options are declared with the ``bool`` type.
indexer_factory = PluginFactory(
    BuiltinIndexer,
    cache=bool,
    compact=bool,
)
304
305
class BuiltinResult(Result):
    """Search result that turns each entry of ``self.context`` into a ``Hit``.

    NOTE(review): ``context`` is presumably the list of matching URIs that
    ``BuiltinIndexer.search`` passes as the third constructor argument --
    confirm against ``Result``.
    """

    def __iter__(self):
        """Yield a ``Hit`` for every entry in the result context."""
        for uri in self.context:
            yield self._translate(uri)

    def __getitem__(self, index):
        """Return the ``Hit`` for the entry at position ``index``."""
        return self._translate(self.context[index])

    def _translate(self, uri):
        """Build the ``Hit`` for ``uri`` from the indexer's stored attributes."""
        indexer = self.indexer
        framework = indexer.framework
        # Attributes are stored under the document identifier from _wid()
        # (see index() and fetch()), not under the raw URI.  Looking up the
        # raw URI always misses when the indexer runs in compact mode.
        attributes = indexer.attributes.get(indexer._wid(uri), {})
        attributes['uri'] = URI(uri)
        # Keys are encoded to byte strings before being passed as **kwargs.
        attributes = dict([(k.encode('utf-8'), v)
                           for k, v in attributes.iteritems()])
        return Hit(current=framework.fetch, indexed=indexer.fetch,
                   **attributes)
Note: See TracBrowser for help on using the browser.