| 1 |
# -*- coding: utf-8 -*- |
|---|
| 2 |
# |
|---|
| 3 |
# Copyright (C) 2006 Alec Thomas <alec@swapoff.org> |
|---|
| 4 |
# |
|---|
| 5 |
# This software is licensed as described in the file COPYING, which |
|---|
| 6 |
# you should have received as part of this distribution. |
|---|
| 7 |
# |
|---|
| 8 |
|
|---|
| 9 |
""" |
|---|
| 10 |
Lucene |
|---|
| 11 |
------ |
|---|
| 12 |
|
|---|
| 13 |
The Lucene adapter relies on PyLucene_, which is a Swig interface to a gcj |
|---|
| 14 |
compiled version of Java Lucene. |
|---|
| 15 |
|
|---|
| 16 |
PyLucene is good, but there are some serious compatibility issues with Python |
|---|
| 17 |
threading due to Java threading wanting to be the only implementation running. |
|---|
| 18 |
|
|---|
| 19 |
Usage |
|---|
| 20 |
~~~~~ |
|---|
| 21 |
|
|---|
| 22 |
:: |
|---|
| 23 |
|
|---|
| 24 |
lucene://<path> |
|---|
| 25 |
|
|---|
| 26 |
Installation |
|---|
| 27 |
~~~~~~~~~~~~ |
|---|
| 28 |
|
|---|
| 29 |
PyLucene_ is quite difficult to install. Either use your distribution's |
|---|
| 30 |
packaging system or, if you're brave, attempt a source installation, which is |
|---|
| 31 |
beyond the scope of this document. |
|---|
| 32 |
|
|---|
| 33 |
.. _PyLucene: http://pylucene.osafoundation.org/ |
|---|
| 34 |
|
|---|
| 35 |
""" |
|---|
| 36 |
|
|---|
| 37 |
import os |
|---|
| 38 |
import PyLucene |
|---|
| 39 |
from pyndexter import * |
|---|
| 40 |
|
|---|
| 41 |
class LuceneIndexer(Indexer):
    """Indexer that stores documents in an on-disk PyLucene index.

    The Lucene index lives in ``<path>/lucene.db``.  An ``IndexWriter`` is
    opened only when the framework is in ``READWRITE`` mode; in any other
    mode ``self.writer`` is ``None``, so ``index()`` and ``optimise()``
    cannot be used.
    """

    def __init__(self, framework, path):
        """Open the Lucene index under ``path``, creating it if permitted.

        :param framework: Owning framework; ``framework.mode`` decides
            whether a writer is opened.
        :param path: Directory that holds the index files.
        """
        Indexer.__init__(self, framework)
        self.path = path
        self.db_path = os.path.join(path, 'lucene.db')
        self.state_path = os.path.join(path, 'store.db')

        # A new index is created only when none exists on disk *and* the
        # framework is allowed to write.
        create = not os.path.exists(self.db_path) and framework.mode == READWRITE
        self.lucene_store = PyLucene.FSDirectory.getDirectory(self.db_path, create)
        self.analyzer = PyLucene.StandardAnalyzer()

        if framework.mode == READWRITE:
            self.writer = PyLucene.IndexWriter(self.lucene_store, self.analyzer, create)
            # Raise the writer's maximum field length.  The original author
            # marked this value with "??", so the rationale for 1048576 is
            # unknown.
            self.writer.setMaxFieldLength(1048576)
        else:
            self.writer = None

    def index(self, document):
        """Add ``document`` to the index.

        Every entry of ``document.attributes`` becomes a stored, tokenized
        field; ``document.content`` is added as the ``content`` field through
        a ``StringReader``.

        Requires an open writer (``READWRITE`` mode).
        """
        doc = PyLucene.Document()
        for k, v in document.attributes.iteritems():
            doc.add(PyLucene.Field(unicode(k), unicode(v),
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.TOKENIZED))
        reader = PyLucene.StringReader(document.content)
        doc.add(PyLucene.Field('content', reader))
        self.writer.addDocument(doc)

    def discard(self, uri):
        """Delete the documents whose ``uri`` term equals ``uri``.

        NOTE(review): ``index()`` stores every attribute as TOKENIZED, so an
        exact-term match on the full URI may never hit -- confirm whether the
        ``uri`` field should be indexed untokenized.
        """
        reader = PyLucene.IndexReader.open(self.db_path)
        try:
            reader.deleteDocuments(PyLucene.Term('uri', unicode(uri)))
        finally:
            # Always release the reader, even when the delete fails.
            reader.close()

    def search(self, query):
        """Run ``query`` against the ``content`` field.

        :param query: Object providing ``as_string()``, which is parsed with
            Lucene's ``QueryParser`` using this indexer's analyzer.
        :returns: A ``LuceneResult`` wrapping the Lucene hits.
        """
        query_string = query.as_string()
        # NOTE(review): the searcher is never closed here; presumably the
        # returned hits still need it to load documents -- confirm.
        searcher = PyLucene.IndexSearcher(self.lucene_store)
        parsed = PyLucene.QueryParser('content', self.analyzer).parse(query_string)
        #sort_field = PyLucene.SortField('RELEVANCE', False)
        #sort = PyLucene.Sort(sort_field)

        # TODO This is causing a segfault?!?!
        #sort = PyLucene.Sort.INDEXORDER
        #search = searcher.search(query, sort)
        hits = searcher.search(parsed)
        return LuceneResult(self, query, hits)

    def optimise(self):
        """Optimise the index.  Requires an open writer (``READWRITE`` mode)."""
        self.writer.optimize()

    def flush(self):
        """Flush pending writes when the bindings support it.

        Does nothing when no writer is open or when the writer has no
        ``flush`` method.
        """
        # XXX Assume this will make it into the Lucene bindings
        # Look the method up instead of catching AttributeError around the
        # call, so an AttributeError raised inside a real flush() is not
        # silently swallowed.
        flush = getattr(self.writer, 'flush', None)
        if flush is not None:
            flush()

    def close(self):
        """Close the writer, if one was opened."""
        if self.writer is not None:
            self.writer.close()
|---|
| 99 |
|
|---|
| 100 |
|
|---|
| 101 |
# Module-level factory for this adapter, built by passing LuceneIndexer to
# PluginFactory.  NOTE(review): presumably this is the hook pyndexter uses to
# map lucene:// URIs onto this indexer -- confirm against PluginFactory.
indexer_factory = PluginFactory(LuceneIndexer)
|---|
| 102 |
|
|---|
| 103 |
|
|---|
| 104 |
class LuceneResult(Result):
    """Search result that converts the Lucene entries in ``self.context`` to ``Hit`` objects."""

    def __iter__(self):
        """Yield one ``Hit`` per entry of the search context."""
        # Each entry unpacks into a pair; only the second element is used.
        for _, doc in self.context:
            yield self._translate(doc)

    def __getitem__(self, index):
        """Return the ``Hit`` for the context entry at ``index``."""
        doc = self.context[index]
        return self._translate(doc)

    def _translate(self, hit):
        """Build a ``Hit`` from the fields of a single Lucene entry.

        Field names are encoded to UTF-8 byte strings so they can be passed
        as keyword arguments; the ``uri`` value is wrapped in ``URI``.
        """
        fields = dict((field.name().encode('utf-8'), field.stringValue())
                      for field in hit.fields())
        fields['uri'] = URI(fields['uri'])
        fetch_current = self.indexer.framework.fetch
        fetch_indexed = self.indexer.fetch
        return Hit(current=fetch_current, indexed=fetch_indexed, **fields)
|---|