Changeset 376
- Timestamp: 02/04/07 23:03:27 (2 years ago)
- Files:
- pyndexter/trunk/pyndexter/indexers/hyperestraier.py (modified) (1 diff)
- pyndexter/trunk/pyndexter/indexers/lucene.py (modified) (2 diffs)
- pyndexter/trunk/pyndexter/__init__.py (modified) (7 diffs)
- pyndexter/trunk/pyndexter/sources/file.py (modified) (1 diff)
- pyndexter/trunk/.todo (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/trunk/pyndexter/indexers/hyperestraier.py
r374 r376 22 22 class HyperestraierIndexer(Indexer): 23 23 """ Pyndexter adapter for the Hyperestraier indexer. """ 24 def __init__(self, framework, path, hype_mode=None , **ignore):24 def __init__(self, framework, path, hype_mode=None): 25 25 Indexer.__init__(self, framework) 26 26 self.hype_mode = hype_mode pyndexter/trunk/pyndexter/indexers/lucene.py
r374 r376 13 13 class LuceneIndexer(Indexer): 14 14 def __init__(self, framework, path): 15 Indexer.__init__(self, framework) 15 16 self.path = path 16 17 self.db_path = os.path.join(path, 'lucene.db') 17 18 self.store_path = os.path.join(path, 'store.db') 18 19 19 self.lucene_store = PyLucene.FSDirectory.getDirectory(db_path, True) 20 create = not os.path.exists(self.db_path) and framework.mode == READWRITE 21 self.lucene_store = PyLucene.FSDirectory.getDirectory(self.db_path, create) 22 self.analyzer = PyLucene.StandardAnalyzer() 20 23 21 24 if framework.mode == READWRITE: 22 analyser = PyLucene.StandardAnalyzer() 23 self.writer = PyLucene.IndexWriter(self.lucene_store, analyser, True) 24 #self.writer.setMaxFieldLength(1048576) # ?? 25 26 self.reader = PyLucene.IndexReader.open(self.path) 27 self.searcher = PyLucene.IndexSearcher(self.lucene_store) 25 self.writer = PyLucene.IndexWriter(self.lucene_store, self.analyzer, create) 26 self.writer.setMaxFieldLength(1048576) # ?? 27 else: 28 self.writer = None 28 29 29 30 def index(self, document): 30 31 doc = PyLucene.Document() 31 32 for k, v in document.attributes.iteritems(): 32 doc.add(PyLucene.Field( k, v, PyLucene.Field.Store.YES,33 doc.add(PyLucene.Field(str(k), str(v), PyLucene.Field.Store.YES, 33 34 PyLucene.Field.Index.TOKENIZED)) 34 reader = PyLucene.StringReader(doc .content)35 doc.add( Field('content', reader))35 reader = PyLucene.StringReader(document.content) 36 doc.add(PyLucene.Field('content', reader)) 36 37 self.writer.addDocument(doc) 38 39 def discard(self, uri): 40 reader = PyLucene.IndexReader.open(self.db_path) 41 reader.deleteDocuments(PyLucene.Term('uri', uri)) 42 reader.close() 37 43 38 44 def search(self, query): 39 45 query = query.to_boolean() 40 query = PyLucene.QueryParser.parse(query, 'title', analyzer) 41 raise query 46 searcher = PyLucene.IndexSearcher(self.lucene_store) 47 query = PyLucene.QueryParser('content', self.analyzer).parse(query) 48 #sort_field = 
PyLucene.SortField('RELEVANCE', False) 49 #sort = PyLucene.Sort(sort_field) 42 50 43 def discard(self): 44 pass 51 # TODO This is causing a segfault?!?! 52 #sort = PyLucene.Sort.INDEXORDER 53 #search = searcher.search(query, sort) 54 search = searcher.search(query) 55 return LuceneResult(self, search) 56 57 def optimise(self): 58 self.writer.optimize() 59 60 def sync(self): 61 try: 62 # XXX Assume this will make it into the Lucene bindings 63 self.writer.flush() 64 except AttributeError: 65 pass 45 66 46 67 def close(self): 47 self.db.close() 48 49 def sync(self): 50 self.db.synchronize() 68 if self.writer: 69 self.writer.close() 51 70 52 71 def state_store(self): … … 54 73 55 74 75 indexer_factory = PluginFactory(LuceneIndexer) 76 77 56 78 class LuceneResult(Result): 57 pass 79 def __iter__(self): 80 for id, hit in self.context: 81 yield self._translate(hit) 82 83 def __getitem__(self, index): 84 return self._translate(self.context[index]) 85 86 def _translate(self, hit): 87 attributes = {} 88 for field in hit.fields(): 89 attributes[field.name().encode('utf-8')] = field.stringValue() 90 return Hit(**attributes) pyndexter/trunk/pyndexter/__init__.py
r374 r376 61 61 READONLY READWRITE 62 62 63 Query Framework Document Source Indexer Result Hit StateStorePluginFactory63 Query Framework Document Source Indexer Result StateStore Hit PluginFactory 64 64 """.split() 65 65 … … 138 138 139 139 def __repr__(self): 140 return '<Document "%s">' % self.uri 140 return '<%s %s>' % (self.__class__.__name__, 141 ' '.join(['%s=%s' % (k, repr(v)) for k, v in 142 self.attributes.iteritems()])) 141 143 142 144 def __getattr__(self, key): … … 146 148 raise AttributeError(unicode(e)) 147 149 150 def __contains__(self, key): 151 return key in self.attributes 152 148 153 def __hash__(self): 149 154 return hash(self.uri) 155 156 def get(self, key, default=None): 157 return self.attributes.get(key, default) 150 158 151 159 def _set_content(self, content): … … 536 544 class PluginFactory(object): 537 545 """Factory for translating URL-style query parameters into a standard 538 module constructor call. pyndexter modules always546 module constructor call. 539 547 540 548 >>> class C: … … 746 754 747 755 class Result(object): 748 """ Represents the result of a search. Each hit is returned as a Hit749 object. """756 """Represents the result of a search. Each hit is returned as a Hit 757 object.""" 750 758 751 759 def __init__(self, indexer, context): … … 754 762 755 763 def __iter__(self): 756 """ Return an iterator over the result set, returning a Hit object for757 each matching document."""764 """Return an iterator over the result set, returning a Hit object 765 for each matching document.""" 758 766 raise NotImplementedError 759 767 … … 763 771 764 772 def __getitem__(self, index): 765 """ Return a Hit object for a specific index in the search result. Not766 necessarily implemented by all Indexers."""773 """Return a Hit object for a specific index in the search result. 774 Not necessarily implemented by all Indexers.""" 767 775 raise NotImplementedError 768 776 pyndexter/trunk/pyndexter/sources/file.py
r375 r376 101 101 102 102 source_factory = PluginFactory(FileSource, 103 include=PluginFactory.List(str),104 exclude=PluginFactory.List(str))103 include=PluginFactory.List(str), 104 exclude=PluginFactory.List(str)) pyndexter/trunk/.todo
r374 r376 5 5 <note priority="medium" time="1145722536"> 6 6 Callbacks for index() and discard(), perhaps something similar for Source objects? 7 <comment> 8 Framework.update() accepts a filter callback. This could be sufficient. 9 </comment> 7 10 </note> 8 <note priority="medium" time="1145802778" >11 <note priority="medium" time="1145802778" done="1170655322"> 9 12 Finish PyLucene adapter 13 <comment> 14 Functional enough for a first commit. 15 </comment> 10 16 </note> 11 17 <note priority="medium" time="1145854608" done="1146296772"> … … 63 69 Use metakit for pure-Python implementation? (Check out "divmod pyndex" for ideas) 64 70 </note> 71 <note priority="medium" time="1170604364"> 72 Deprecate Hit and just use Document - they're almost identical in functionality. 73 </note> 74 <note priority="medium" time="1170651530"> 75 Add generalised "field" indexing. 76 </note> 77 <note priority="medium" time="1170653876"> 78 Search result ordering. 79 </note> 80 <note priority="high" time="1170654664"> 81 How do we detect when sources have been removed from the index? If file:///tmp changes to file:///usr, the Framework has no real way of detecting which URI's in the index are no longer valid. 82 </note> 65 83 </todo>
