| 1 |
# -*- coding: utf-8 -*- |
|---|
| 2 |
# |
|---|
| 3 |
# Copyright (C) 2006 Alec Thomas <alec@swapoff.org> |
|---|
| 4 |
# |
|---|
| 5 |
# This software is licensed as described in the file COPYING, which |
|---|
| 6 |
# you should have received as part of this distribution. |
|---|
| 7 |
# |
|---|
| 8 |
|
|---|
| 9 |
""" |
|---|
| 10 |
Builtin Indexer |
|---|
| 11 |
--------------- |
|---|
| 12 |
|
|---|
| 13 |
The builtin Pyndexter indexer. |
|---|
| 14 |
|
|---|
| 15 |
Pyndexter provides a basic inverted index indexer. It does not currently |
|---|
| 16 |
support substring matching, wildcards, or scoring, but these features are |
|---|
| 17 |
planned. |
|---|
| 18 |
|
|---|
| 19 |
Usage |
|---|
| 20 |
~~~~~ |
|---|
| 21 |
|
|---|
| 22 |
:: |
|---|
| 23 |
|
|---|
| 24 |
builtin://<path>?compact=<bool>&cache=<bool>&dbm=<dbm> |
|---|
| 25 |
|
|---|
| 26 |
``compact=<bool>`` (default: ``true``) |
|---|
| 27 |
Whether to compact the database as much as possible. Slight slowdown. |
|---|
| 28 |
|
|---|
| 29 |
``cache=<bool>`` (default: ``false``) |
|---|
| 30 |
Should we keep a cached copy of each document as it is indexed? |
|---|
| 31 |
|
|---|
| 32 |
``dbm=<dbm>`` (default: ``anydbm``) |
|---|
| 33 |
Supported dbm's are ``anydbm``, ``dbhash``, ``gdbm`` and ``dbm`` (Python 2.5). |
|---|
| 34 |
|
|---|
| 35 |
Installation |
|---|
| 36 |
~~~~~~~~~~~~ |
|---|
| 37 |
|
|---|
| 38 |
No installation is required. Pyndexter uses the anydbm Python module for |
|---|
| 39 |
storage. |
|---|
| 40 |
""" |
|---|
| 41 |
|
|---|
| 42 |
import os |
|---|
| 43 |
import re |
|---|
| 44 |
import anydbm |
|---|
| 45 |
import cPickle as pickle |
|---|
| 46 |
import md5 |
|---|
| 47 |
from UserDict import DictMixin |
|---|
| 48 |
from StringIO import StringIO |
|---|
| 49 |
from gzip import GzipFile |
|---|
| 50 |
from pyndexter import * |
|---|
| 51 |
from pyndexter.util import set |
|---|
| 52 |
|
|---|
| 53 |
|
|---|
| 54 |
class KeyedSet(object):
    """A persistent ``key -> set`` mapping on top of a dbm-style object.

    Both the key and the stored set are pickled (protocol 2) before they
    are handed to ``db``, so ``db`` only has to support item access, item
    deletion and ``keys()`` on strings.
    """

    def __init__(self, db):
        """Wrap ``db``, the underlying mapping that holds the pickled data."""
        self.db = db

    def update(self, key, values):
        """Add every member of the iterable ``values`` to the set at ``key``.

        A missing ``key`` is treated as an empty set.
        """
        key = pickle.dumps(key, 2)
        try:
            v = pickle.loads(self.db[key])
        except KeyError:
            v = set()
        v.update(values)
        self.db[key] = pickle.dumps(v, 2)

    def remove(self, key, values=None):
        """Remove ``values`` from the set at ``key``, or drop the whole entry.

        With ``values=None`` the entry for ``key`` is deleted.  Otherwise
        ``values`` is an iterable of members to take out of the stored set.
        A missing ``key`` and members that are not present are ignored.
        """
        key = pickle.dumps(key, 2)
        if values is None:
            try:
                del self.db[key]
            except KeyError:
                pass
            return

        try:
            v = pickle.loads(self.db[key])
        except KeyError:
            return
        # ``values`` is a collection, mirroring update().  Calling
        # ``v.remove(values)`` would look for the collection itself as a
        # single member, raise KeyError and leave the stored set untouched.
        v.difference_update(values)
        self.db[key] = pickle.dumps(v, 2)

    def replace(self, key, values):
        """Store ``values`` under ``key``, overwriting any previous set."""
        key = pickle.dumps(key, 2)
        self.db[key] = pickle.dumps(values, 2)

    def get(self, key):
        """Return the set stored at ``key``, or an empty set if missing."""
        key = pickle.dumps(key, 2)
        try:
            return pickle.loads(self.db[key])
        except KeyError:
            return set()

    def keys(self):
        """Yield every key of the underlying db, unpickled."""
        for key in self.db.keys():
            yield pickle.loads(key)
|---|
| 96 |
|
|---|
| 97 |
|
|---|
| 98 |
class PickleDict(DictMixin):
    """Dictionary facade over ``db`` that pickles keys and values.

    Everything written to the wrapped ``db`` is serialised with pickle
    protocol 2 and everything read back is unpickled.  Only the four
    primitive operations are defined here; the rest of the mapping API is
    inherited from ``DictMixin``.
    """

    def __init__(self, db):
        self.db = db

    def __getitem__(self, key):
        # A missing key surfaces as whatever ``db`` raises for it.
        raw_value = self.db[pickle.dumps(key, 2)]
        return pickle.loads(raw_value)

    def __setitem__(self, key, value):
        raw_value = pickle.dumps(value, 2)
        raw_key = pickle.dumps(key, 2)
        self.db[raw_key] = raw_value

    def __delitem__(self, key):
        raw_key = pickle.dumps(key, 2)
        del self.db[raw_key]

    def keys(self):
        """Return a list of all keys, unpickled."""
        decoded = []
        for raw_key in self.db.keys():
            decoded.append(pickle.loads(raw_key))
        return decoded
|---|
| 114 |
|
|---|
| 115 |
|
|---|
| 116 |
class BuiltinIndexer(Indexer):
    """Builtin Pyndexter indexer.

    A simple inverted index kept in several dbm databases below
    ``<path>/builtin.db``:

    * ``words``       word id -> set of URI ids containing that word
    * ``uris``        URI id  -> set of word ids found in that document
    * ``attributes``  URI id  -> attribute dictionary of the document
    * ``cache``       URI id  -> document content (only if ``cache`` is set)

    In compact mode words and URIs are replaced by short ids allocated by
    ``_wid`` (``wordid``/``idword``/``config`` databases).  Otherwise the
    unicode text itself serves as its own id.
    """

    def __init__(self, framework, path, dbm='anydbm', cache=False,
                 compact=True):
        """Open (or create, in READWRITE mode) the index databases.

        ``framework`` is the owning framework; its ``reduce`` is switched to
        split/unique mode.  ``path`` is the directory holding the index.
        ``dbm`` names the module that is imported and whose
        ``open(path, mode)`` is used.  ``cache`` enables storing document
        content and ``compact`` enables the word-id mapping and gzip
        compression of cached content.
        """
        Indexer.__init__(self, framework)

        self.path = path
        self.compact = compact
        self.cache = cache
        self.state_path = os.path.join(path, 'store.db')
        self.db_path = os.path.join(path, 'builtin.db')

        # We want the minimum set of words
        framework.reduce.split = True
        framework.reduce.unique = True

        dbm = __import__(dbm, {}, {}, [''])

        if framework.mode == READWRITE:
            if not os.path.exists(self.db_path):
                os.makedirs(self.db_path)
            mode = 'c'
        else:
            mode = 'r'

        # Every handle opened below is remembered so close() can really
        # close it instead of relying on garbage collection.
        self._dbs = []

        def dbopen(name):
            db = dbm.open(os.path.join(self.db_path, name), mode)
            self._dbs.append(db)
            return db

        # wordid:set(uriid)
        self.words = KeyedSet(dbopen('words'))
        # uriid:set(wordid)
        self.uris = KeyedSet(dbopen('uris'))
        # uri:dict(attributes)
        self.attributes = PickleDict(dbopen('attributes'))

        if cache:
            self.cachedb = PickleDict(dbopen('cache'))

        if compact:
            # id:word mapping
            self.idword = PickleDict(dbopen('idword'))
            # word:id mapping
            self.wordid = PickleDict(dbopen('wordid'))
            # key:value config
            self.config = PickleDict(dbopen('config'))

            self.config.setdefault('wordid', 0)
        else:
            # Without compaction the id of a word is the word itself, so
            # the translation methods are overridden on this instance.
            self._words = self._wids = lambda w: set(map(unicode, w))
            self._word = self._wid = lambda w: unicode(w)

    def index(self, document):
        """Add ``document`` to the index, replacing any previous version.

        Stores the content (if caching), the attributes, and updates both
        directions of the word/URI mapping.
        """
        uri = unicode(self._wid(document.uri))
        words = self._wids(self.framework.reduce(document.content))
        doc_set = set([uri])

        if self.cache:
            if self.compact:
                # Do a low-compression gzip
                buf = StringIO()
                # Constructed outside the try so that a failing constructor
                # is not masked by a NameError in the finally clause.
                gz = GzipFile(fileobj=buf, compresslevel=1, mode='wb')
                try:
                    gz.write(document.content.encode('utf-8', 'ignore'))
                finally:
                    gz.close()
                self.cachedb[uri] = buf.getvalue()
            else:
                self.cachedb[uri] = document.content

        self.attributes[uri] = document.attributes

        # The words previously indexed for this document live in the
        # ``uris`` table under the URI id (see the replace() call below),
        # not in the ``words`` table under the raw URI.
        old_words = self.uris.get(uri)
        removed_words = old_words.difference(words)
        new_words = words.difference(old_words)

        # Clear out words that have been removed from the index
        for word in removed_words:
            self._drop_posting(word, uri)

        # Add new words
        for word in new_words:
            self.words.update(word, doc_set)

        self.uris.replace(uri, words)

    replace = index

    def discard(self, uri):
        """Remove the document at ``uri`` from the index.

        Deletes its attributes, its cached content, its word set and its
        entry in the posting set of every word it contained.  Unknown URIs
        are ignored.
        """
        uri = unicode(uri)
        if self.compact:
            # Look the id up directly: _wid() would allocate a fresh id
            # for a URI that was never indexed.
            try:
                uriid = self.wordid[uri]
            except KeyError:
                return
        else:
            uriid = uri

        try:
            del self.attributes[uriid]
        except KeyError:
            pass

        if self.cache:
            try:
                del self.cachedb[uriid]
            except KeyError:
                pass

        # Postings are held in ``words`` (wordid -> uriids); ``uris`` only
        # holds the reverse direction for this document.
        for word in self.uris.get(uriid):
            self._drop_posting(word, uriid)
        self.uris.remove(uriid)

    def __iter__(self):
        """Yield a ``URI`` for every indexed document."""
        for uri in self.uris.keys():
            yield URI(self._word(uri))

    def fetch(self, uri):
        """Return a ``Document`` reconstructed from the index for ``uri``.

        With caching enabled the stored content is returned (quality 0.99);
        otherwise the content is the indexed words joined by spaces
        (quality 0.1).  The stored attributes, plus ``uri``, are passed to
        ``Document`` as keyword arguments.
        """
        uri = unicode(uri)
        uriid = self._wid(uri)
        attributes = self.attributes.get(uriid, {})
        # Keyword argument names must be byte strings.
        attributes = dict([(k.encode('utf-8'), v)
                           for k, v in attributes.iteritems()])
        attributes['uri'] = uri
        if self.cache:
            content = self.cachedb[uriid]
            if self.compact:
                gz = GzipFile(fileobj=StringIO(content), mode='rb')
                content = gz.read().decode('utf-8', 'ignore')
            quality = 0.99
        else:
            content = ' '.join(self._words(self.uris.get(uriid)))
            quality = 0.1
        return Document(content=content, quality=quality, **attributes)

    def close(self):
        """Close every database handle and drop the references to them."""
        for db in self._dbs:
            db.close()
        self._dbs = []

        self.words = None
        self.uris = None
        self.attributes = None
        self.cachedb = None
        self.wordid = None
        self.idword = None
        self.config = None

    def search(self, query):
        """Evaluate ``query`` and return a ``BuiltinResult`` of matching URIs.

        TERM, AND and OR nodes are supported; a NOT node raises
        ``NotImplementedError``.
        """
        query.reduce(self.framework.reduce)
        # FIXME NOT is not supported yet
        # FIXME Words without a WID can be automatically excluded from the
        # search

        def visit(node):
            if node.type == node.TERM:
                return self.words.get(self._wid(node.value))
            elif node.type == node.AND:
                return visit(node.left).intersection(visit(node.right))
            elif node.type == node.OR:
                return visit(node.left).union(visit(node.right))
            elif node.type == node.NOT:
                raise NotImplementedError('NOT is ... not supported')

        uris = visit(query)
        return BuiltinResult(self, query, list(self._words(uris)))

    # Internal methods
    def _drop_posting(self, wid, uriid):
        """Remove ``uriid`` from the set of documents recorded for ``wid``.

        The posting set is rewritten explicitly, and deleted once it becomes
        empty, so no empty entries accumulate in the ``words`` table.
        """
        postings = self.words.get(wid)
        if uriid not in postings:
            return
        postings.remove(uriid)
        if postings:
            self.words.replace(wid, postings)
        else:
            self.words.remove(wid)

    def _wids(self, words):
        """Convert a collection of words to a set of wids."""
        out = set()
        for word in words:
            out.add(self._wid(unicode(word)))
        return out

    def _words(self, wids):
        """Convert a collection of wids to words."""
        out = set()
        for wid in wids:
            out.add(self.idword[wid])
        return out

    def _wid(self, word):
        """Return, or allocate, a unique word identifier."""
        word = unicode(word)
        try:
            return self.wordid[word]
        except KeyError:
            # Allocate the next counter value and record both directions.
            next_id = self.config['wordid']
            self.config['wordid'] = next_id + 1
            wid = unicode(next_id)
            self.wordid[word] = wid
            self.idword[wid] = word
            return wid

    def _word(self, wid):
        """Return the word (or URI) registered for ``wid``."""
        return self.idword[wid]
|---|
| 303 |
|
|---|
| 304 |
|
|---|
| 305 |
# Plugin entry point for ``builtin://`` URIs.  The ``cache`` and ``compact``
# options are declared with the ``bool`` type -- presumably so the factory
# converts the query-string values before calling BuiltinIndexer; confirm
# against PluginFactory.
indexer_factory = PluginFactory(
    BuiltinIndexer,
    cache=bool,
    compact=bool,
)
|---|
| 306 |
|
|---|
| 307 |
|
|---|
| 308 |
class BuiltinResult(Result):
    """Search result of the builtin indexer.

    ``self.context`` is iterated and indexed as the sequence of matching
    URIs, and ``self.indexer`` is used as the ``BuiltinIndexer`` that
    produced them (both presumably set by ``Result`` from the constructor
    arguments -- confirm against the base class).
    """

    def __iter__(self):
        """Yield a ``Hit`` for every matching URI."""
        for uri in self.context:
            yield self._translate(uri)

    def __getitem__(self, index):
        """Return the ``Hit`` for the URI at position ``index``."""
        return self._translate(self.context[index])

    def _translate(self, uri):
        """Build a ``Hit`` for ``uri`` from the attributes in the index."""
        indexer = self.indexer
        framework = indexer.framework
        # The indexer stores attributes under the URI's id, not under the
        # URI text, so translate before the lookup.  With compaction off
        # the id is the URI itself and this is a no-op.
        attributes = indexer.attributes.get(indexer._wid(uri), {})
        attributes['uri'] = URI(uri)
        # Keyword argument names must be byte strings.
        attributes = dict([(k.encode('utf-8'), v)
                           for k, v in attributes.iteritems()])
        return Hit(current=framework.fetch, indexed=indexer.fetch,
                   **attributes)
|---|