root/pyndexter/trunk/pyndexter/indexers/_lucene.py

Revision 452, 3.5 kB (checked in by athomas, 1 year ago)

pyndexter: All modules are now prefixed with _ to avoid import collisions. Updated unit tests.

Line 
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright (C) 2006 Alec Thomas <alec@swapoff.org>
4 #
5 # This software is licensed as described in the file COPYING, which
6 # you should have received as part of this distribution.
7 #
8
9 """
10 Lucene
11 ------
12
13 The Lucene adapter relies on PyLucene_, which is a Swig interface to a gcj
14 compiled version of Java Lucene.
15
16 PyLucene is good, but there are some serious compatibility issues with Python
17 threading due to Java threading wanting to be the only implementation running.
18
19 Usage
20 ~~~~~
21
22 ::
23
24     lucene://<path>
25
26 Installation
27 ~~~~~~~~~~~~
28
29 PyLucene_ is quite difficult to install. Either use your distribution's
30 packaging system or, if you're brave, attempt a source installation, which is
31 beyond the scope of this document.
32
33 .. _PyLucene: http://pylucene.osafoundation.org/
34
35 """
36
37 import os
38 import PyLucene
39 from pyndexter import *
40
class LuceneIndexer(Indexer):
    """Indexer backend that keeps its index in a PyLucene ``FSDirectory``.

    The Lucene index is stored in ``<path>/lucene.db``.  An ``IndexWriter`` is
    opened only when the framework is in ``READWRITE`` mode; in any other mode
    ``self.writer`` is ``None``.
    """

    # Value passed to IndexWriter.setMaxFieldLength().  Per the Lucene API
    # docs this caps the number of terms indexed for a single field (Lucene's
    # own default is 10,000), so a large value keeps big documents from being
    # silently truncated.
    MAX_FIELD_LENGTH = 1048576

    def __init__(self, framework, path):
        """Open the Lucene index under *path*, creating it when needed.

        framework -- owning framework; its ``mode`` attribute decides whether
                     an ``IndexWriter`` is opened.
        path      -- directory containing ``lucene.db`` (the Lucene index).
        """
        Indexer.__init__(self, framework)
        self.path = path
        self.db_path = os.path.join(path, 'lucene.db')
        # Not referenced elsewhere in this class; kept for callers that read it.
        self.state_path = os.path.join(path, 'store.db')

        writable = framework.mode == READWRITE
        # A fresh index is only created when none exists yet and we are
        # allowed to write; an existing index is always opened in place.
        create = writable and not os.path.exists(self.db_path)
        self.lucene_store = PyLucene.FSDirectory.getDirectory(self.db_path,
                                                              create)
        self.analyzer = PyLucene.StandardAnalyzer()

        if writable:
            self.writer = PyLucene.IndexWriter(self.lucene_store,
                                               self.analyzer, create)
            self.writer.setMaxFieldLength(self.MAX_FIELD_LENGTH)
        else:
            self.writer = None

    def index(self, document):
        """Add *document* to the index.

        Every entry of ``document.attributes`` becomes a stored, tokenized
        field; ``document.content`` is added as the ``content`` field via a
        reader.  Requires a writer (``READWRITE`` mode).
        """
        doc = PyLucene.Document()
        # NOTE(review): 'uri' is tokenized like every other attribute, while
        # discard() deletes by an exact 'uri' term -- confirm that the
        # analyzer leaves URIs as a single term, otherwise deletes may miss.
        for name, value in document.attributes.iteritems():
            doc.add(PyLucene.Field(unicode(name), unicode(value),
                                   PyLucene.Field.Store.YES,
                                   PyLucene.Field.Index.TOKENIZED))
        content_reader = PyLucene.StringReader(document.content)
        doc.add(PyLucene.Field('content', content_reader))
        self.writer.addDocument(doc)

    def discard(self, uri):
        """Delete all indexed documents whose ``uri`` term equals *uri*."""
        reader = PyLucene.IndexReader.open(self.db_path)
        try:
            reader.deleteDocuments(PyLucene.Term('uri', unicode(uri)))
        finally:
            # Always release the reader, even if the delete fails.
            reader.close()

    def search(self, query):
        """Run *query* against the ``content`` field.

        Returns a ``LuceneResult`` wrapping the raw Lucene search result.
        """
        query_string = query.as_string()
        searcher = PyLucene.IndexSearcher(self.lucene_store)
        parser = PyLucene.QueryParser('content', self.analyzer)
        lucene_query = parser.parse(query_string)
        # TODO: passing an explicit PyLucene.Sort to search() was observed to
        # segfault, so results come back in Lucene's default order.
        # NOTE(review): the searcher is intentionally not closed here;
        # presumably the returned hits still read from it -- confirm before
        # adding a close().
        hits = searcher.search(lucene_query)
        return LuceneResult(self, query, hits)

    def optimise(self):
        """Ask the writer to optimise the index (``IndexWriter.optimize``)."""
        self.writer.optimize()

    def flush(self):
        """Flush pending writes if the writer supports it; otherwise no-op.

        Nothing happens when there is no writer or when the PyLucene binding
        does not expose ``IndexWriter.flush``.
        """
        # Look the method up explicitly instead of wrapping the call in
        # ``except AttributeError`` so that an AttributeError raised *inside*
        # flush() is not silently swallowed.
        flush = getattr(self.writer, 'flush', None)
        if flush is not None:
            flush()

    def close(self):
        """Close the writer, if one was opened."""
        if self.writer is not None:
            self.writer.close()
100
# Module-level plugin hook: wraps LuceneIndexer in a PluginFactory.
# NOTE(review): presumably pyndexter looks up ``indexer_factory`` when
# resolving ``lucene://`` URIs -- confirm against the framework's plugin loader.
101 indexer_factory = PluginFactory(LuceneIndexer)
102
103
class LuceneResult(Result):
    """Search result that turns raw Lucene hits into pyndexter ``Hit`` objects.

    ``self.context`` is iterated and indexed to obtain the Lucene documents;
    presumably it is the search result handed over by
    ``LuceneIndexer.search`` -- verify against ``Result.__init__``.
    """

    def __iter__(self):
        """Yield one ``Hit`` per entry of the underlying search result."""
        # Iterating the context produces pairs; only the second item (the
        # Lucene document) is needed here.
        for _position, lucene_doc in self.context:
            yield self._translate(lucene_doc)

    def __getitem__(self, index):
        """Return the ``Hit`` for the result at position *index*."""
        lucene_doc = self.context[index]
        return self._translate(lucene_doc)

    def _translate(self, hit):
        """Build a ``Hit`` from the fields of the Lucene document *hit*."""
        # Field names are encoded to byte strings because they are passed on
        # as keyword-argument names below.
        stored = dict((field.name().encode('utf-8'), field.stringValue())
                      for field in hit.fields())
        stored['uri'] = URI(stored['uri'])
        fetch_current = self.indexer.framework.fetch
        fetch_indexed = self.indexer.fetch
        return Hit(current=fetch_current, indexed=fetch_indexed, **stored)
Note: See TracBrowser for help on using the browser.