| 1 | # -*- coding: utf-8 -*- |
|---|
| 2 | # |
|---|
| 3 | # Copyright (C) 2006 Alec Thomas <alec@swapoff.org> |
|---|
| 4 | # |
|---|
| 5 | # This software is licensed as described in the file COPYING, which |
|---|
| 6 | # you should have received as part of this distribution. |
|---|
| 7 | # |
|---|
| 8 | |
|---|
| 9 | """ |
|---|
| 10 | Builtin Indexer |
|---|
| 11 | --------------- |
|---|
| 12 | |
|---|
| 13 | The builtin Pyndexter indexer. |
|---|
| 14 | |
|---|
| 15 | Pyndexter provides a basic inverted index indexer. It does not currently |
|---|
| 16 | support substring matching, wildcards, or scoring, but these features are |
|---|
| 17 | planned. |
|---|
| 18 | |
|---|
| 19 | Usage |
|---|
| 20 | ~~~~~ |
|---|
| 21 | |
|---|
| 22 | :: |
|---|
| 23 | |
|---|
| 24 | builtin://<path>?compact=<bool>&cache=<bool>&dbm=<dbm> |
|---|
| 25 | |
|---|
| 26 | ``compact=<bool>`` (default: ``true``) |
|---|
| 27 | Whether to compact the database as much as possible. Slight slowdown. |
|---|
| 28 | |
|---|
| 29 | ``cache=<bool>`` (default: ``false``) |
|---|
| 30 | Should we keep a cached copy of each document as it is indexed? |
|---|
| 31 | |
|---|
| 32 | ``dbm=<dbm>`` (default: ``anydbm``) |
|---|
| 33 | Supported dbm modules are ``anydbm``, ``dbhash``, ``gdbm`` and ``dbm`` (Python 2.5). |
|---|
| 34 | |
|---|
| 35 | Installation |
|---|
| 36 | ~~~~~~~~~~~~ |
|---|
| 37 | |
|---|
| 38 | No installation is required. Pyndexter uses the anydbm Python module for |
|---|
| 39 | storage. |
|---|
| 40 | """ |
|---|
| 41 | |
|---|
| 42 | import os |
|---|
| 43 | import re |
|---|
| 44 | import anydbm |
|---|
| 45 | import cPickle as pickle |
|---|
| 46 | import md5 |
|---|
| 47 | from UserDict import DictMixin |
|---|
| 48 | from StringIO import StringIO |
|---|
| 49 | from gzip import GzipFile |
|---|
| 50 | from pyndexter import * |
|---|
| 51 | from pyndexter.util import set |
|---|
| 52 | |
|---|
| 53 | |
|---|
class KeyedSet(object):
    """Map keys to sets of values on top of a dict-like ``db``.

    Both the keys and the stored sets are pickled (protocol 2) before being
    handed to ``db``, so ``db`` only ever sees pickled strings.
    """

    def __init__(self, db):
        self.db = db

    def update(self, key, values):
        """Add every item of ``values`` to the set stored under ``key``.

        A missing ``key`` is treated as an empty set.
        """
        key = pickle.dumps(key, 2)
        try:
            v = pickle.loads(self.db[key])
        except KeyError:
            v = set()
        v.update(values)
        self.db[key] = pickle.dumps(v, 2)

    def remove(self, key, values=None):
        """Remove ``values`` from the set under ``key``, or drop ``key``.

        With ``values=None`` the whole entry is deleted. Otherwise every
        item of ``values`` is taken out of the stored set; items that are
        not present, and keys that do not exist, are silently ignored.
        """
        key = pickle.dumps(key, 2)
        if values is None:
            try:
                del self.db[key]
            except KeyError:
                pass
            return

        try:
            v = pickle.loads(self.db[key])
        except KeyError:
            return
        # ``set.remove(values)`` would look for the collection itself as a
        # single element; ``difference_update`` removes its members, which
        # mirrors what ``update`` does when adding.
        v.difference_update(values)
        self.db[key] = pickle.dumps(v, 2)

    def replace(self, key, values):
        """Store ``values`` under ``key``, overwriting any previous entry."""
        key = pickle.dumps(key, 2)
        self.db[key] = pickle.dumps(values, 2)

    def get(self, key):
        """Return the set stored under ``key``, or an empty set if absent."""
        key = pickle.dumps(key, 2)
        try:
            return pickle.loads(self.db[key])
        except KeyError:
            return set()

    def keys(self):
        """Yield every stored key, unpickled."""
        for key in self.db.keys():
            yield pickle.loads(key)
|---|
| 96 | |
|---|
| 97 | |
|---|
class PickleDict(DictMixin):
    """Dictionary facade over ``db`` that pickles every key and value.

    Keys and values are serialised with pickle protocol 2 on the way in and
    unpickled on the way out; the remaining mapping methods are supplied by
    ``DictMixin``.
    """

    def __init__(self, db):
        self.db = db

    def __getitem__(self, key):
        raw_key = pickle.dumps(key, 2)
        raw_value = self.db[raw_key]
        return pickle.loads(raw_value)

    def __setitem__(self, key, value):
        raw_key = pickle.dumps(key, 2)
        raw_value = pickle.dumps(value, 2)
        self.db[raw_key] = raw_value

    def __delitem__(self, key):
        raw_key = pickle.dumps(key, 2)
        del self.db[raw_key]

    def keys(self):
        """Return a list of all keys, unpickled."""
        decoded = []
        for raw_key in self.db.keys():
            decoded.append(pickle.loads(raw_key))
        return decoded
|---|
| 114 | |
|---|
| 115 | |
|---|
class BuiltinIndexer(Indexer):
    """Builtin Pyndexter indexer: an inverted index kept in dbm files.

    The stores live under ``<path>/builtin.db``:

    - ``words``: word id -> set of URI ids whose document contains the word
    - ``uris``: URI id -> set of word ids found in that document
    - ``attributes``: URI id -> the document's attributes
    - ``cachedb`` (only with ``cache``): URI id -> document content
    - ``wordid`` / ``idword`` / ``config`` (only with ``compact``): the
      word <-> id translation tables and the next-id counter

    With ``compact`` disabled the id of a word (or URI) is simply its
    unicode form.
    """

    def __init__(self, framework, path, dbm='anydbm', cache=False,
                 compact=True):
        """Open (or create, in READWRITE mode) the index stores.

        ``dbm`` is the name of the dbm module to import and use, ``cache``
        enables storing document content, and ``compact`` enables word-id
        translation and gzip compression of cached content.
        """
        Indexer.__init__(self, framework)

        self.path = path
        self.compact = compact
        self.cache = cache
        self.state_path = os.path.join(path, 'store.db')
        self.db_path = os.path.join(path, 'builtin.db')

        # We want the minimum set of words
        framework.reduce.split = True
        framework.reduce.unique = True

        # The non-empty fromlist makes __import__ return the named module
        # itself rather than its top-level package.
        dbm = __import__(dbm, {}, {}, [''])

        if framework.mode == READWRITE:
            if not os.path.exists(self.db_path):
                os.makedirs(self.db_path)
            mode = 'c'
        else:
            mode = 'r'

        def dbopen(name):
            return dbm.open(os.path.join(self.db_path, name), mode)

        # wordid:set(uriid)
        self.words = KeyedSet(dbopen('words'))
        # uriid:set(wordid)
        self.uris = KeyedSet(dbopen('uris'))
        # uriid:dict(attributes)
        self.attributes = PickleDict(dbopen('attributes'))

        if cache:
            self.cachedb = PickleDict(dbopen('cache'))

        if compact:
            # id:word mapping
            self.idword = PickleDict(dbopen('idword'))
            # word:id mapping
            self.wordid = PickleDict(dbopen('wordid'))
            # key:value config
            self.config = PickleDict(dbopen('config'))

            self.config.setdefault('wordid', 0)
        else:
            # Without compaction a word is its own id, so shadow the
            # id-translating methods with plain unicode conversions.
            self._words = self._wids = lambda w: set(map(unicode, w))
            self._word = self._wid = lambda w: unicode(w)

    def index(self, document):
        """Add ``document`` to the index, refreshing it if already present.

        Words that no longer occur in the document are unlinked from it and
        newly appearing words are linked to it.
        """
        uri = unicode(self._wid(document.uri))
        words = self._wids(self.framework.reduce(document.content))
        doc_set = set([uri])

        if self.cache:
            if self.compact:
                # Do a low-compression gzip
                buffer = StringIO()
                gz = GzipFile(fileobj=buffer, compresslevel=1, mode='wb')
                gz.write(document.content.encode('utf-8'))
                gz.close()
                self.cachedb[uri] = buffer.getvalue()
            else:
                self.cachedb[uri] = document.content

        self.attributes[uri] = document.attributes

        # The words previously indexed for this document are recorded in
        # ``uris`` under the URI id (``words`` is keyed by word id).
        old_words = self.uris.get(uri)
        removed_words = old_words.difference(words)
        new_words = words.difference(old_words)

        # Clear out words that have been removed from the index
        for word in removed_words:
            self.words.remove(word, doc_set)

        # Add new words
        for word in new_words:
            self.words.update(word, doc_set)

        self.uris.replace(uri, words)

    replace = index

    def discard(self, uri):
        """Remove the document identified by ``uri`` from every store.

        Unknown URIs are ignored.
        """
        # All stores are keyed by URI id, not by the URI text.
        uriid = self._wid(unicode(uri))

        try:
            del self.attributes[uriid]
        except KeyError:
            pass

        if self.cache:
            try:
                del self.cachedb[uriid]
            except KeyError:
                pass

        # Unlink the document from the posting set of each of its words,
        # then drop the document's own word set.
        uri_set = set([uriid])
        for word in self.uris.get(uriid):
            self.words.remove(word, uri_set)
        self.uris.remove(uriid)

    def __iter__(self):
        """Yield a ``URI`` for every indexed document."""
        for uri in self.uris.keys():
            yield URI(self._word(uri))

    def fetch(self, uri):
        """Return a ``Document`` for ``uri`` built from the stored data.

        With caching enabled the stored content is returned with quality
        0.99 (a ``KeyError`` propagates if ``uri`` was never cached);
        otherwise the content is the document's indexed words joined by
        spaces, with quality 0.1.
        """
        uri = unicode(uri)
        uriid = self._wid(uri)
        attributes = self.attributes.get(uriid, {})
        attributes = dict([(k.encode('utf-8'), v)
                           for k, v in attributes.iteritems()])
        attributes['uri'] = uri
        if self.cache:
            content = self.cachedb[uriid]
            if self.compact:
                gz = GzipFile(fileobj=StringIO(content), mode='rb')
                content = gz.read().decode('utf-8')
            quality = 0.99
        else:
            content = ' '.join(self._words(self.uris.get(uriid)))
            quality = 0.1
        return Document(content=content, quality=quality, **attributes)

    def close(self):
        """Close every underlying database and drop the references.

        Safe to call more than once.
        """
        for name in ('words', 'uris', 'attributes', 'cachedb',
                     'wordid', 'idword', 'config'):
            store = getattr(self, name, None)
            db = getattr(store, 'db', None)
            # Release the dbm handle so pending writes reach disk; stores
            # backed by objects without close() are simply dropped.
            if db is not None and hasattr(db, 'close'):
                db.close()
            setattr(self, name, None)

    def search(self, query):
        """Evaluate ``query`` and return a ``BuiltinResult`` of matches.

        Raises ``NotImplementedError`` if the query contains a NOT node.
        """
        query.reduce(self.framework.reduce)
        # FIXME NOT is not supported yet
        # FIXME Words without a WID can be automatically excluded from the
        # search

        def visit(node):
            if node.type == node.TERM:
                return self.words.get(self._wid(node.value))
            elif node.type == node.AND:
                return visit(node.left).intersection(visit(node.right))
            elif node.type == node.OR:
                return visit(node.left).union(visit(node.right))
            elif node.type == node.NOT:
                raise NotImplementedError('NOT is ... not supported')

        uris = visit(query)
        return BuiltinResult(self, query, list(self._words(uris)))

    # Internal methods
    def _wids(self, words):
        """Convert a collection of words to a set of wids."""
        out = set()
        for word in words:
            out.add(self._wid(unicode(word)))
        return out

    def _words(self, wids):
        """Convert a collection of wids to words."""
        out = set()
        for wid in wids:
            out.add(self.idword[wid])
        return out

    def _wid(self, word):
        """Return, or allocate, a unique word identifier."""
        word = unicode(word)
        try:
            return self.wordid[word]
        except KeyError:
            # Allocate the next counter value and record both directions
            # of the mapping.
            wid = self.config['wordid']
            self.config['wordid'] = wid + 1
            wid = unicode(wid)
            self.wordid[word] = wid
            self.idword[wid] = word
            return wid

    def _word(self, wid):
        """Return the word registered for ``wid``."""
        return self.idword[wid]
|---|
| 301 | |
|---|
| 302 | |
|---|
# Plugin entry point for the builtin indexer. The ``bool`` keyword values
# presumably tell PluginFactory how to convert the ``cache`` and ``compact``
# URI parameters -- confirm against pyndexter.PluginFactory.
indexer_factory = PluginFactory(
    BuiltinIndexer,
    cache=bool,
    compact=bool
)
|---|
| 304 | |
|---|
| 305 | |
|---|
class BuiltinResult(Result):
    """Search result whose ``context`` is a list of matching URI strings."""

    def __iter__(self):
        """Yield a ``Hit`` for every URI in the result."""
        for uri in self.context:
            yield self._translate(uri)

    def __getitem__(self, index):
        """Return the ``Hit`` for the URI at position ``index``."""
        return self._translate(self.context[index])

    def _translate(self, uri):
        """Build a ``Hit`` for ``uri`` from the indexer's stored attributes."""
        indexer = self.indexer
        framework = indexer.framework
        # The indexer stores attributes under the URI's id (see its index()
        # and fetch()), so translate the URI text before looking it up.
        attributes = indexer.attributes.get(indexer._wid(uri), {})
        attributes['uri'] = URI(uri)
        # Keyword argument names must be byte strings, not unicode.
        attributes = dict([(k.encode('utf-8'), v)
                           for k, v in attributes.iteritems()])
        return Hit(current=framework.fetch, indexed=indexer.fetch,
                   **attributes)
|---|