root/pyndexter/trunk/pyndexter/indexers/_builtin.py

Revision 452, 9.2 kB (checked in by athomas, 1 year ago)

pyndexter: All modules are now prefixed with _ to avoid import collisions. Updated unit tests.

  • Property svn:eol-style set to native
Line 
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright (C) 2006 Alec Thomas <alec@swapoff.org>
4 #
5 # This software is licensed as described in the file COPYING, which
6 # you should have received as part of this distribution.
7 #
8
9 """
10 Builtin Indexer
11 ---------------
12
13 The builtin Pyndexter indexer.
14
15 Pyndexter provides a basic inverted index indexer. It does not currently
16 support substring matching, wildcards, or scoring, but these features are
17 planned.
18
19 Usage
20 ~~~~~
21
22 ::
23
24     builtin://<path>?compact=<bool>&cache=<bool>&dbm=<dbm>
25
26 ``compact=<bool>`` (default: ``true``)
27     Whether to compact the database as much as possible. Slight slowdown.
28
29 ``cache=<bool>`` (default: ``false``)
30     Should we keep a cached copy of each document as it is indexed?
31
32 ``dbm=<dbm>`` (default: ``anydbm``)
33     Supported dbm's are ``anydbm``, ``dbhash``, ``gdbm`` and ``dbm`` (Python 2.5).
34
35 Installation
36 ~~~~~~~~~~~~
37
38 No installation is required. Pyndexter uses the anydbm Python module for
39 storage.
40 """
41
42 import os
43 import re
44 import anydbm
45 import cPickle as pickle
46 import md5
47 from UserDict import DictMixin
48 from StringIO import StringIO
49 from gzip import GzipFile
50 from pyndexter import *
51 from pyndexter.util import set
52
53
class KeyedSet(object):
    """Persistent mapping from keys to sets, stored in a dbm-style mapping.

    Keys and set values are both pickled (protocol 2) before they are handed
    to ``db``, so ``db`` only ever stores pickled strings.

    NOTE: values are read back with ``pickle.loads``, so ``db`` must be a
    trusted store.
    """

    def __init__(self, db):
        """Wrap ``db``, a mapping-like object such as an open dbm database."""
        self.db = db

    def update(self, key, values):
        """Add every item of the iterable ``values`` to the set at ``key``.

        A new set is created when ``key`` is not stored yet.
        """
        key = pickle.dumps(key, 2)
        try:
            v = pickle.loads(self.db[key])
        except KeyError:
            v = set()
        v.update(values)
        self.db[key] = pickle.dumps(v, 2)

    def remove(self, key, values=None):
        """Remove items from the set at ``key``, or drop ``key`` altogether.

        With ``values`` left as ``None`` the whole entry is deleted.
        Otherwise ``values`` is an iterable (mirroring ``update``) whose items
        are taken out of the stored set; items that are not present are
        ignored. A missing ``key`` is silently ignored in both cases.
        """
        key = pickle.dumps(key, 2)
        if values is None:
            try:
                del self.db[key]
            except KeyError:
                pass
            return

        try:
            v = pickle.loads(self.db[key])
        except KeyError:
            return
        # ``set.remove(values)`` would look for the collection itself as a
        # single element and always miss; remove the individual items instead.
        v.difference_update(values)
        self.db[key] = pickle.dumps(v, 2)

    def replace(self, key, values):
        """Store ``values`` at ``key``, overwriting any previous entry."""
        key = pickle.dumps(key, 2)
        self.db[key] = pickle.dumps(values, 2)

    def get(self, key):
        """Return the set stored at ``key``, or an empty set if absent."""
        key = pickle.dumps(key, 2)
        try:
            return pickle.loads(self.db[key])
        except KeyError:
            return set()

    def keys(self):
        """Iterate over the (unpickled) keys present in the database."""
        for key in self.db.keys():
            yield pickle.loads(key)
96
97
class PickleDict(DictMixin):
    """Mapping facade over ``db`` that pickles every key and value.

    Only the primitive operations are defined here; the remainder of the
    dictionary interface is inherited from ``DictMixin``.
    """

    def __init__(self, db):
        self.db = db

    def __getitem__(self, key):
        raw = self.db[pickle.dumps(key, 2)]
        return pickle.loads(raw)

    def __setitem__(self, key, value):
        payload = pickle.dumps(value, 2)
        self.db[pickle.dumps(key, 2)] = payload

    def __delitem__(self, key):
        stored_key = pickle.dumps(key, 2)
        del self.db[stored_key]

    def keys(self):
        # Stored keys are pickles; hand back the original objects.
        return list(map(pickle.loads, self.db.keys()))
114
115
class BuiltinIndexer(Indexer):
    """Builtin Pyndexter indexer: an inverted index kept in dbm databases.

    The databases live in the directory ``<path>/builtin.db``:

    ``words``
        word id -> set of URI ids of the documents containing that word.
    ``uris``
        URI id -> set of word ids found in that document.
    ``attributes``
        URI id -> the document's attribute dictionary.
    ``cache`` (only when ``cache`` is enabled)
        URI id -> a copy of the document content.
    ``idword`` / ``wordid`` / ``config`` (only when ``compact`` is enabled)
        id <-> text translation tables and the next-id counter.

    When ``compact`` is disabled no translation takes place: the unicode text
    of a word or URI serves as its own id.
    """

    def __init__(self, framework, path, dbm='anydbm', cache=False,
                 compact=True):
        """Open (or create) the index databases below ``path``.

        ``framework`` supplies the access mode and the word reducer, ``dbm``
        is the name of the dbm module to import, ``cache`` keeps a copy of
        each indexed document, and ``compact`` stores ids instead of the full
        text of words and URIs.
        """
        Indexer.__init__(self, framework)

        self.path = path
        self.compact = compact
        self.cache = cache
        self.state_path = os.path.join(path, 'store.db')
        self.db_path = os.path.join(path, 'builtin.db')

        # We want the minimum set of words
        framework.reduce.split = True
        framework.reduce.unique = True

        # The non-empty fromlist makes __import__ return the named module
        # itself rather than its top-level package.
        dbm = __import__(dbm, {}, {}, [''])

        if framework.mode == READWRITE:
            if not os.path.exists(self.db_path):
                os.makedirs(self.db_path)
            mode = 'c'
        else:
            mode = 'r'

        def dbopen(name):
            return dbm.open(os.path.join(self.db_path, name), mode)

        # wordid:set(uriid)
        self.words = KeyedSet(dbopen('words'))
        # uriid:set(wordid)
        self.uris = KeyedSet(dbopen('uris'))
        # uriid:dict(attributes)
        self.attributes = PickleDict(dbopen('attributes'))

        if cache:
            self.cachedb = PickleDict(dbopen('cache'))

        if compact:
            # id:word mapping
            self.idword = PickleDict(dbopen('idword'))
            # word:id mapping
            self.wordid = PickleDict(dbopen('wordid'))
            # key:value config
            self.config = PickleDict(dbopen('config'))

            self.config.setdefault('wordid', 0)
        else:
            # Without compaction the id of a word is the word itself, so the
            # translation methods are shadowed by identity conversions.
            self._words = self._wids = lambda w: set(map(unicode, w))
            self._word = self._wid = lambda w: unicode(w)

    def index(self, document):
        """Add ``document`` to the index, replacing any previous version."""
        uri = unicode(self._wid(document.uri))
        words = self._wids(self.framework.reduce(document.content))
        doc_set = set([uri])

        if self.cache:
            if self.compact:
                # Do a low-compression gzip
                raw = StringIO()
                # Construct outside the ``try`` so that ``gz`` is always
                # bound by the time the ``finally`` clause runs.
                gz = GzipFile(fileobj=raw, compresslevel=1, mode='wb')
                try:
                    gz.write(document.content.encode('utf-8', 'ignore'))
                finally:
                    gz.close()
                self.cachedb[uri] = raw.getvalue()
            else:
                self.cachedb[uri] = document.content

        self.attributes[uri] = document.attributes

        # Words previously recorded for this document live in the
        # uriid -> wordids table, keyed by the URI id.
        old_words = self.uris.get(uri)
        removed_words = old_words.difference(words)
        new_words = words.difference(old_words)

        # Clear out words that have been removed from the index
        for word in removed_words:
            self.words.remove(word, doc_set)

        # Add new words
        for word in new_words:
            self.words.update(word, doc_set)

        self.uris.replace(uri, words)

    replace = index

    def discard(self, uri):
        """Remove the document identified by ``uri`` from the index."""
        # Everything is stored under the URI id, exactly as index() and
        # fetch() compute it. Note that in compact mode _wid() allocates an
        # id for a URI it has never seen.
        uriid = unicode(self._wid(unicode(uri)))
        try:
            del self.attributes[uriid]
        except KeyError:
            pass

        if self.cache:
            # Do not leave a stale copy of the content behind.
            try:
                del self.cachedb[uriid]
            except KeyError:
                pass

        # Drop this document from the posting set of every word it contained,
        # then forget the document's own word list.
        uri_set = set([uriid])
        for word in self.uris.get(uriid):
            self.words.remove(word, uri_set)
        self.uris.remove(uriid)

    def __iter__(self):
        """Iterate over the URIs of all indexed documents."""
        for uri in self.uris.keys():
            yield URI(self._word(uri))

    def fetch(self, uri):
        """Return a ``Document`` for ``uri`` rebuilt from the index.

        With caching enabled the stored content is returned (quality 0.99);
        otherwise the content is just the document's indexed words joined by
        spaces (quality 0.1).
        """
        uri = unicode(uri)
        uriid = self._wid(uri)
        attributes = self.attributes.get(uriid, {})
        # Keys are encoded to byte strings so they can be passed as **kwargs.
        attributes = dict([(k.encode('utf-8'), v)
                           for k, v in attributes.iteritems()])
        attributes['uri'] = uri
        if self.cache:
            content = self.cachedb[uriid]
            if self.compact:
                gz = GzipFile(fileobj=StringIO(content), mode='rb')
                content = gz.read().decode('utf-8', 'ignore')
            quality = 0.99
        else:
            content = ' '.join(self._words(self.uris.get(uriid)))
            quality = 0.1
        return Document(content=content, quality=quality, **attributes)

    def close(self):
        """Drop the references to every database opened by this indexer."""
        self.words = None
        self.uris = None
        self.attributes = None
        self.cachedb = None
        self.wordid = None
        self.idword = None
        self.config = None

    def search(self, query):
        """Evaluate ``query`` and return a ``BuiltinResult`` of matching URIs.

        Raises ``NotImplementedError`` if the query contains a NOT node.
        """
        query.reduce(self.framework.reduce)
        # FIXME NOT is not supported yet
        # FIXME Words without a WID can be automatically excluded from the
        # search

        def visit(node):
            if node.type == node.TERM:
                return self.words.get(self._wid(node.value))
            elif node.type == node.AND:
                return visit(node.left).intersection(visit(node.right))
            elif node.type == node.OR:
                return visit(node.left).union(visit(node.right))
            elif node.type == node.NOT:
                raise NotImplementedError('NOT is ... not supported')

        uris = visit(query)
        return BuiltinResult(self, query, list(self._words(uris)))

    # Internal methods
    def _wids(self, words):
        """Convert a collection of words to a set of wids."""
        out = set()
        for word in words:
            out.add(self._wid(unicode(word)))
        return out

    def _words(self, wids):
        """Convert a collection of wids to words."""
        out = set()
        for wid in wids:
            out.add(self.idword[wid])
        return out

    def _wid(self, word):
        """Return, or allocate, a unique word identifier."""
        word = unicode(word)
        try:
            return self.wordid[word]
        except KeyError:
            # Unknown word: take the next counter value and record the
            # mapping in both directions.
            next_id = self.config['wordid']
            self.config['wordid'] = next_id + 1
            wid = unicode(next_id)
            self.wordid[word] = wid
            self.idword[wid] = word
            return wid

    def _word(self, wid):
        """Return the word (or URI) text recorded for ``wid``."""
        return self.idword[wid]
303
304
# Factory object exposing BuiltinIndexer to Pyndexter. The ``cache`` and
# ``compact`` options are declared as ``bool`` -- presumably so the textual
# values from the indexer URI get converted; confirm against PluginFactory.
indexer_factory = PluginFactory(BuiltinIndexer,
                                cache=bool,
                                compact=bool)
306
307
class BuiltinResult(Result):
    """Result of a ``BuiltinIndexer`` search.

    ``self.context`` is iterated and indexed as a sequence of URIs
    (presumably the list handed over by ``BuiltinIndexer.search``); each one
    is converted into a ``Hit`` when it is accessed.
    """

    def __iter__(self):
        for uri in self.context:
            yield self._translate(uri)

    def __getitem__(self, index):
        return self._translate(self.context[index])

    def _translate(self, uri):
        """Build the ``Hit`` for ``uri`` from the indexer's stored attributes."""
        indexer = self.indexer
        framework = indexer.framework
        # The indexer stores attributes under the URI *id*, not the URI text,
        # so translate first; looking up the text directly never matches when
        # the index is compacted.
        attributes = indexer.attributes.get(indexer._wid(uri), {})
        attributes['uri'] = URI(uri)
        # Keys are encoded to byte strings so they can be passed as **kwargs.
        attributes = dict([(k.encode('utf-8'), v)
                           for k, v in attributes.iteritems()])
        return Hit(current=framework.fetch, indexed=indexer.fetch,
                   **attributes)
Note: See TracBrowser for help on using the browser.