| 1 |
# -*- coding: utf-8 -*- |
|---|
| 2 |
# |
|---|
| 3 |
# Copyright (C) 2006 Alec Thomas <alec@swapoff.org> |
|---|
| 4 |
# |
|---|
| 5 |
# This software is licensed as described in the file COPYING, which |
|---|
| 6 |
# you should have received as part of this distribution. |
|---|
| 7 |
# |
|---|
| 8 |
|
|---|
| 9 |
""" |
|---|
| 10 |
Lupy |
|---|
| 11 |
---- |
|---|
| 12 |
|
|---|
| 13 |
Lupy_ is a (deprecated) pure-Python indexer. It is excruciatingly slow, |
|---|
| 14 |
presumably because of its desire to be compatible with Lucene. Included |
|---|
| 15 |
as an exercise mostly :) |
|---|
| 16 |
|
|---|
| 17 |
.. _Lupy: http://www.divmod.org/projects/lupy |
|---|
| 18 |
|
|---|
| 19 |
Usage |
|---|
| 20 |
~~~~~ |
|---|
| 21 |
|
|---|
| 22 |
:: |
|---|
| 23 |
|
|---|
| 24 |
lupy://<path> |
|---|
| 25 |
|
|---|
| 26 |
Installation |
|---|
| 27 |
~~~~~~~~~~~~ |
|---|
| 28 |
|
|---|
| 29 |
:: |
|---|
| 30 |
|
|---|
| 31 |
easy_install http://gentoo.prz.rzeszow.pl/distfiles/Lupy-0.2.1.tar.gz |
|---|
| 32 |
""" |
|---|
| 33 |
|
|---|
| 34 |
import os |
|---|
| 35 |
from pyndexter import * |
|---|
| 36 |
import lupy |
|---|
| 37 |
import lupy.indexer |
|---|
| 38 |
import lupy.search |
|---|
| 39 |
|
|---|
| 40 |
|
|---|
| 41 |
class LupyIndexer(Indexer):
    """Indexer implementation that stores its data in a Lupy index.

    The index lives in ``<path>/lupy.db``. Document text goes into the
    ``text`` field, and each non-``None`` document attribute is passed to
    Lupy as an underscore-prefixed keyword argument.
    """

    def __init__(self, framework, path):
        """Open the Lupy index under ``path``, creating it if allowed.

        :param framework: owning framework object; its ``mode`` attribute is
            compared against ``READWRITE`` to decide whether anything may be
            created on disk.
        :param path: directory that holds the index files.
        """
        Indexer.__init__(self, framework)
        self.path = path
        # The Lupy path is handed over as a UTF-8 encoded byte string.
        self.db_path = os.path.join(self.path, 'lupy.db').encode('utf-8')
        # Recorded here but not used anywhere else in this class.
        self.state_path = os.path.join(self.path, 'state.db')

        writable = framework.mode == READWRITE
        if writable and not os.path.exists(self.path):
            os.makedirs(self.path)
        # Only create a fresh index when writing is allowed and nothing
        # exists at the target location yet.
        create_index = writable and not os.path.exists(self.db_path)
        self.db = lupy.indexer.Index(self.db_path, create=create_index)

    def index(self, document):
        """Add ``document`` to the index, replacing any entry for its URI.

        :param document: object exposing ``uri``, ``content`` and an
            ``attributes`` mapping.
        """
        # Attributes whose value is None are dropped; the rest are converted
        # to unicode and keyed by '_' + the UTF-8 encoded attribute name.
        # NOTE(review): the underscore prefix presumably tells Lupy to treat
        # the value as an untokenised keyword field -- confirm against
        # lupy.indexer.Index.index.
        keyword_fields = dict(
            ('_' + name.encode('utf-8'), unicode(value))
            for name, value in document.attributes.iteritems()
            if value is not None)
        # Remove the previous entry first so a re-index does not duplicate.
        self.discard(uri=document.uri)
        self.db.index(text=document.content, **keyword_fields)

    def discard(self, uri):
        """Delete the index entries whose ``uri`` field equals ``uri``."""
        uri_text = unicode(uri)
        self.db.delete(uri=uri_text)

    def search(self, query):
        """Run ``query`` against the index and wrap the hits.

        :param query: root node of a parsed query tree (see
            ``_compile_query`` for the node attributes that are read).
        :returns: a ``LupyResult`` built from this indexer, the original
            query and the hits returned by Lupy.
        """
        compiled = lupy.indexer.BooleanQuery()
        # Top-level terms are added as required and not prohibited.
        self._compile_query(query, (True, False), compiled)
        index_searcher = lupy.search.indexsearcher.IndexSearcher(self.db_path)
        return LupyResult(self, query, index_searcher.search(compiled))

    def optimise(self):
        """Ask Lupy to optimise the underlying index."""
        self.db.optimize()

    def close(self):
        """Close the underlying Lupy index."""
        self.db.close()

    # Internal methods
    def _compile_query(self, node, op, query):
        """Translate the query tree rooted at ``node`` into ``query``.

        :param node: query tree node; ``type``, ``left``, ``right`` and
            ``value`` are read, and ``type`` is compared with the node's own
            ``NULL``/``AND``/``OR``/``NOT``/``TERM`` constants.
        :param op: ``(required, prohibited)`` flag pair, unpacked into
            ``query.add`` when ``node`` is a term.
        :param query: Lupy ``BooleanQuery`` that receives the term clauses.
        :raises NotImplementedError: for any other node type.

        NOTE(review): every clause lands in the same flat ``query`` and
        ``op`` is only consulted for TERM nodes, so nested boolean structure
        is not preserved as sub-queries -- confirm this is intended.
        """
        if not node or node.type == node.NULL:
            return

        kind = node.type
        if kind == node.AND:
            # Both operands are required.
            branches, flags = ('left', 'right'), (True, False)
        elif kind == node.OR:
            # Both operands are optional.
            branches, flags = ('left', 'right'), (False, False)
        elif kind == node.NOT:
            # The single operand is prohibited.
            branches, flags = ('left',), (False, True)
        elif kind == node.TERM:
            term = lupy.indexer.Term('text', node.value)
            query.add(lupy.indexer.TermQuery(term), *op)
            return
        else:
            raise NotImplementedError

        for branch in branches:
            self._compile_query(getattr(node, branch), flags, query)
|---|
| 93 |
|
|---|
| 94 |
|
|---|
| 95 |
# Module-level factory wrapping LupyIndexer.
# NOTE(review): PluginFactory is presumably supplied by the pyndexter star
# import and used to resolve lupy://<path> URIs -- confirm against pyndexter.
indexer_factory = PluginFactory(LupyIndexer)
|---|
| 96 |
|
|---|
| 97 |
class LupyResult(Result):
    """Search result that converts Lupy hits into ``Hit`` objects.

    NOTE(review): ``self.context`` is presumably the Lupy hits object and
    ``self.indexer`` the ``LupyIndexer`` passed in by ``LupyIndexer.search``;
    both are set by the ``Result`` base class -- confirm there.
    """

    def __iter__(self):
        """Yield one ``Hit`` per entry of ``self.context``, in order."""
        for position, lupy_doc in enumerate(self.context):
            yield self._translate(position, lupy_doc)

    def __getitem__(self, index):
        """Return the ``Hit`` for the entry at ``index`` of ``self.context``."""
        lupy_doc = self.context[index]
        return self._translate(index, lupy_doc)

    # Internal methods
    def _translate(self, index, doc):
        """Build a ``Hit`` from a Lupy document.

        :param index: position of ``doc`` within ``self.context``; used to
            look up the score.
        :param doc: Lupy document exposing ``fieldNames`` and ``get``.
        :returns: a ``Hit`` carrying every stored field, plus ``score`` and
            a ``uri`` wrapped in ``URI``.
        """
        # Field names are coerced to str so they can be passed as keywords.
        hit_fields = dict((str(name), doc.get(name)) for name in doc.fieldNames)
        hit_fields['score'] = self.context.score(index)
        hit_fields['uri'] = URI(hit_fields['uri'])
        owner = self.indexer
        return Hit(current=owner.framework.fetch, indexed=owner.fetch,
                   **hit_fields)
|---|