Changeset 376

Show
Ignore:
Timestamp:
02/04/07 23:03:27 (2 years ago)
Author:
athomas
Message:

pyndexter:

  • Lucene adapter is functional.
  • Few docstring updates.
  • Note about issue with clearing out invalidated source URIs.
Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • pyndexter/trunk/pyndexter/indexers/hyperestraier.py

    r374 r376  
    2222class HyperestraierIndexer(Indexer): 
    2323    """ Pyndexter adapter for the Hyperestraier indexer. """ 
    24     def __init__(self, framework, path, hype_mode=None, **ignore): 
     24    def __init__(self, framework, path, hype_mode=None): 
    2525        Indexer.__init__(self, framework) 
    2626        self.hype_mode = hype_mode 
  • pyndexter/trunk/pyndexter/indexers/lucene.py

    r374 r376  
    1313class LuceneIndexer(Indexer): 
    1414    def __init__(self, framework, path): 
     15        Indexer.__init__(self, framework) 
    1516        self.path = path 
    1617        self.db_path = os.path.join(path, 'lucene.db') 
    1718        self.store_path = os.path.join(path, 'store.db') 
    1819 
    19         self.lucene_store = PyLucene.FSDirectory.getDirectory(db_path, True) 
     20        create = not os.path.exists(self.db_path) and framework.mode == READWRITE 
     21        self.lucene_store = PyLucene.FSDirectory.getDirectory(self.db_path, create) 
     22        self.analyzer = PyLucene.StandardAnalyzer() 
    2023 
    2124        if framework.mode == READWRITE: 
    22             analyser = PyLucene.StandardAnalyzer() 
    23             self.writer = PyLucene.IndexWriter(self.lucene_store, analyser, True) 
    24             #self.writer.setMaxFieldLength(1048576) # ?? 
    25  
    26         self.reader = PyLucene.IndexReader.open(self.path) 
    27         self.searcher = PyLucene.IndexSearcher(self.lucene_store) 
     25            self.writer = PyLucene.IndexWriter(self.lucene_store, self.analyzer, create) 
     26            self.writer.setMaxFieldLength(1048576) # ?? 
     27        else: 
     28            self.writer = None 
    2829 
    2930    def index(self, document): 
    3031        doc = PyLucene.Document() 
    3132        for k, v in document.attributes.iteritems(): 
    32             doc.add(PyLucene.Field(k, v, PyLucene.Field.Store.YES, 
     33            doc.add(PyLucene.Field(str(k), str(v), PyLucene.Field.Store.YES, 
    3334                                   PyLucene.Field.Index.TOKENIZED)) 
    34         reader = PyLucene.StringReader(doc.content) 
    35         doc.add(Field('content', reader)) 
     35        reader = PyLucene.StringReader(document.content) 
     36        doc.add(PyLucene.Field('content', reader)) 
    3637        self.writer.addDocument(doc) 
     38 
     39    def discard(self, uri): 
     40        reader = PyLucene.IndexReader.open(self.db_path) 
     41        reader.deleteDocuments(PyLucene.Term('uri', uri)) 
     42        reader.close() 
    3743 
    3844    def search(self, query): 
    3945        query = query.to_boolean() 
    40         query = PyLucene.QueryParser.parse(query, 'title', analyzer) 
    41         raise query 
     46        searcher = PyLucene.IndexSearcher(self.lucene_store) 
     47        query = PyLucene.QueryParser('content', self.analyzer).parse(query) 
     48        #sort_field = PyLucene.SortField('RELEVANCE', False) 
     49        #sort = PyLucene.Sort(sort_field) 
    4250 
    43     def discard(self): 
    44         pass 
     51        # TODO This is causing a segfault?!?! 
     52        #sort = PyLucene.Sort.INDEXORDER 
     53        #search = searcher.search(query, sort) 
     54        search = searcher.search(query) 
     55        return LuceneResult(self, search) 
     56 
     57    def optimise(self): 
     58        self.writer.optimize() 
     59 
     60    def sync(self): 
     61        try: 
     62            # XXX Assume this will make it into the Lucene bindings 
     63            self.writer.flush() 
     64        except AttributeError: 
     65            pass 
    4566 
    4667    def close(self): 
    47         self.db.close() 
    48  
    49     def sync(self): 
    50         self.db.synchronize() 
     68        if self.writer: 
     69            self.writer.close() 
    5170 
    5271    def state_store(self): 
     
    5473 
    5574 
     75indexer_factory = PluginFactory(LuceneIndexer) 
     76 
     77 
    5678class LuceneResult(Result): 
    57     pass 
     79    def __iter__(self): 
     80        for id, hit in self.context: 
     81            yield self._translate(hit) 
     82 
     83    def __getitem__(self, index): 
     84        return self._translate(self.context[index]) 
     85 
     86    def _translate(self, hit): 
     87        attributes = {} 
     88        for field in hit.fields(): 
     89            attributes[field.name().encode('utf-8')] = field.stringValue() 
     90        return Hit(**attributes) 
  • pyndexter/trunk/pyndexter/__init__.py

    r374 r376  
    6161READONLY READWRITE 
    6262 
    63 Query Framework Document Source Indexer Result Hit StateStore PluginFactory 
     63Query Framework Document Source Indexer Result StateStore Hit PluginFactory 
    6464""".split() 
    6565 
     
    138138 
    139139    def __repr__(self): 
    140         return '<Document "%s">' % self.uri 
     140        return '<%s %s>' % (self.__class__.__name__, 
     141                            ' '.join(['%s=%s' % (k, repr(v)) for k, v in 
     142                                      self.attributes.iteritems()])) 
    141143 
    142144    def __getattr__(self, key): 
     
    146148            raise AttributeError(unicode(e)) 
    147149 
     150    def __contains__(self, key): 
     151        return key in self.attributes 
     152 
    148153    def __hash__(self): 
    149154        return hash(self.uri) 
     155 
     156    def get(self, key, default=None): 
     157        return self.attributes.get(key, default) 
    150158 
    151159    def _set_content(self, content): 
     
    536544class PluginFactory(object): 
    537545    """Factory for translating URL-style query parameters into a standard 
    538     module constructor call. pyndexter modules always 
     546    module constructor call. 
    539547 
    540548    >>> class C: 
     
    746754 
    747755class Result(object): 
    748     """ Represents the result of a search. Each hit is returned as a Hit 
    749     object. """ 
     756    """Represents the result of a search. Each hit is returned as a Hit 
     757    object.""" 
    750758 
    751759    def __init__(self, indexer, context): 
     
    754762 
    755763    def __iter__(self): 
    756         """ Return an iterator over the result set, returning a Hit object for 
    757         each matching document. """ 
     764        """Return an iterator over the result set, returning a Hit object 
     765        for each matching document.""" 
    758766        raise NotImplementedError 
    759767 
     
    763771 
    764772    def __getitem__(self, index): 
    765         """ Return a Hit object for a specific index in the search result. Not 
    766         necessarily implemented by all Indexers. """ 
     773        """Return a Hit object for a specific index in the search result. 
     774        Not necessarily implemented by all Indexers.""" 
    767775        raise NotImplementedError 
    768776 
  • pyndexter/trunk/pyndexter/sources/file.py

    r375 r376  
    101101 
    102102source_factory = PluginFactory(FileSource, 
    103                                   include=PluginFactory.List(str), 
    104                                   exclude=PluginFactory.List(str)) 
     103                               include=PluginFactory.List(str), 
     104                               exclude=PluginFactory.List(str)) 
  • pyndexter/trunk/.todo

    r374 r376  
    55    <note priority="medium" time="1145722536"> 
    66        Callbacks for index() and discard(), perhaps something similar for Source objects? 
     7        <comment> 
     8            Framework.update() accepts a filter callback. This could be sufficient. 
     9        </comment> 
    710    </note> 
    8     <note priority="medium" time="1145802778">
     11    <note priority="medium" time="1145802778" done="1170655322">
    912        Finish PyLucene adapter 
     13        <comment> 
     14            Functional enough for a first commit. 
     15        </comment> 
    1016    </note> 
    1117    <note priority="medium" time="1145854608" done="1146296772"> 
     
    6369        Use metakit for pure-Python implementation? (Check out "divmod pyndex" for ideas) 
    6470    </note> 
     71    <note priority="medium" time="1170604364"> 
     72        Deprecate Hit and just use Document - they're almost identical in functionality. 
     73    </note> 
     74    <note priority="medium" time="1170651530"> 
     75        Add generalised "field" indexing. 
     76    </note> 
     77    <note priority="medium" time="1170653876"> 
     78        Search result ordering. 
     79    </note> 
     80    <note priority="high" time="1170654664"> 
     81        How do we detect when sources have been removed from the index? If file:///tmp changes to file:///usr, the Framework has no real way of detecting which URIs in the index are no longer valid. 
     82    </note> 
    6583</todo>