Changeset 449

Show
Ignore:
Timestamp:
08/15/07 09:17:38 (1 year ago)
Author:
athomas
Message:

pyndexter: Moving to a cleaner API - removed source and state code.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • pyndexter/trunk/pyndexter/indexers/builtin.py

    r399 r449  
    162162            self.config.setdefault('wordid', 0) 
    163163        else: 
    164             self._words = self._wids = lambda w: set(w) 
    165             self._word = self._wid = lambda w: w 
     164            self._words = self._wids = lambda w: set(map(unicode, w)) 
     165            self._word = self._wid = lambda w: unicode(w) 
    166166 
    167167    def index(self, document): 
     
    274274        out = set() 
    275275        for word in words: 
    276             out.add(self._wid(word)) 
     276            out.add(self._wid(unicode(word))) 
    277277        return out 
    278278 
     
    286286    def _wid(self, word): 
    287287        """Return, or allocate, a unique word identifier.""" 
     288        word = unicode(word) 
    288289        try: 
    289290            return self.wordid[word] 
  • pyndexter/trunk/pyndexter/__init__.py

    r397 r449  
    1212indexing engines. It is similar in purpose to the Python DB API. 
    1313 
    14 The main class users will be dealing with is Framework. This class 
    15 ties indexers and sources of documents together and provides a mechanism 
    16 for performing automatic updates. 
     14The main class users will be dealing with is Framework. This presents 
     15a convenient interface to the backend indexers. 
    1716 
    1817An example of indexing all .txt files underneath ``/usr/share/doc``: 
     
    2120 
    2221    import os 
    23     from pyndexter import Framework 
     22    from pyndexter import Framework, Document 
    2423 
    2524    framework = Framework('hyperestraier:///tmp/hyperestraier.idx') 
    26     framework.add_source('file:///usr/share/doc?include=*.txt') 
    27  
    28     framework.update() 
     25 
     26    path = '/usr/share/doc' 
     27 
     28    for file in [path + f for f in os.listdir(path) if f.endswith('.txt')]: 
     29        doc = Document(file, open(file).read()) 
     30        framework.index(doc) 
    2931 
    3032    # Find all documents with Linus and Torvalds in them 
     
    6466READONLY READWRITE 
    6567 
    66 Query Framework Document Source Indexer Result StateStore Hit PluginFactory URI 
     68Query Framework Document Indexer Result Hit PluginFactory URI 
    6769Excerpt 
    6870""".split() 
     
    174176    content = property(lambda self: self._get_content(), 
    175177                       lambda self, value: self._set_content(value)) 
    176  
    177  
    178 class Source(object): 
    179     """ A source of indexable documents. A Source object is responsible for not 
    180     only fetching documents and iterating over them, but for determining what 
    181     has changed in the source. 
    182  
    183     Determing what has changed is achieved with the state() and difference() 
    184     methods. The `state` of a source is the minimum information required to 
    185     be able to determine what has changed. For FileSource this is a list of all 
    186     files and their modification times, for a SubversionSource it would be as 
    187     simple as the changeset number. By default, ``marshal()`` and 
    188     ``difference()`` assume that ``_state`` will contain a dictionary of 
    189     uri:modification-time mappings. 
    190  
    191     All URI's passed to and from Source objects must be `URI` objects. 
    192  
    193     (All attributes, including document contents and URI's must be in unicode) 
    194     """ 
    195  
    196     def __init__(self, framework, include=None, exclude=None, predicate=None): 
    197         if include is None and exclude is None: 
    198             include = ['*'] 
    199             exclude = [] 
    200         elif include is None: 
    201             include = [] 
    202         elif exclude is None: 
    203             exclude = [] 
    204         self.framework = framework 
    205         self.include = include 
    206         self.exclude = exclude 
    207         self.predicate = predicate or self._glob_predicate 
    208         self._state = {} 
    209  
    210     def matches(self, uri): 
    211         """ Does this source handle documents matching the given URI? (This 
    212         method is primarily used by the MetaSource class) """ 
    213         raise NotImplementedError 
    214  
    215     def __hash__(self): 
    216         """ The hash must uniquely identify the source. (This method is 
    217         primarily used by the MetaSource class) """ 
    218         raise NotImplementedError 
    219  
    220     def __iter__(self): 
    221         """ Iterate over all *valid* URI's in this source. """ 
    222         raise NotImplementedError 
    223  
    224     def fetch(self, uri): 
    225         """ Fetch a document identified by uri. Ideally the Document object 
    226         returned would not have the content included, but would pass a callable 
    227         to the Document constructor that can fetch it. Should raise 
    228         DocumentNotFound if unable to fetch the document. """ 
    229         raise NotImplementedError 
    230  
    231     def exists(self, uri): 
    232         """ Does the document exist at `uri`? """ 
    233         try: 
    234             self.fetch(uri) 
    235             return True 
    236         except DocumentNotFound: 
    237             return False 
    238  
    239     def marshal(self, file): 
    240         """ Store the state of the `Source` to `file`. Used during an 
    241         `update()`. """ 
    242         state = pickle.dumps(self._state, 2) 
    243         gzip.GzipFile(filename='pyndexter source state', fileobj=file, 
    244                       mode='wb', compresslevel=1).write(state) 
    245  
    246     def difference(self, file): 
    247         """ Return an iterable of tuples representing the differences between 
    248         the current state of the `Source` and that in the provided state. Each 
    249         tuple is in the form `(<transition>, uri)`, where <transition> is one 
    250         of `ADDED`, `REMOVED` or `MODIFIED` and uri is a URI object.""" 
    251         current = set() 
    252         try: 
    253             ungzipped = gzip.GzipFile(fileobj=file, mode='rb').read() 
    254             state = pickle.loads(ungzipped) 
    255         except Exception, e: 
    256             raise InvalidState('Invalid state provided to document source. ' 
    257                                'Exception was %s: %s' % (e.__class__.__name__, e)) 
    258         for uri in self: 
    259             uuri = unicode(uri) 
    260             current.add(uuri) 
    261             if uuri not in state: 
    262                 yield (ADDED, uri) 
    263             elif self.fetch(uri).changed != state[uuri]: 
    264                 yield (MODIFIED, uri) 
    265         for removed in set(state.keys()).difference(current): 
    266             yield (REMOVED, URI(removed)) 
    267  
    268     # Useful helper methods 
    269     def _glob_predicate(self, uri): 
    270         """ Given a list of include and exclude pattern lists, return whether 
    271         the given uri matches. """ 
    272         uri = unicode(uri) 
    273         from fnmatch import fnmatch 
    274         for pattern in self.exclude: 
    275             if fnmatch(uri, pattern): 
    276                 return False 
    277         for pattern in self.include: 
    278             if fnmatch(uri, pattern): 
    279                 return True 
    280         return False 
    281178 
    282179 
     
    601498            return out 
    602499        return u' '.join(out) 
    603  
    604  
    605 class StateStore(object): 
    606     """A class providing file-like objects for storage and retrieval of 
    607     framework state.""" 
    608  
    609     def __init__(self, path): 
    610         self.path = path 
    611  
    612     def store(self): 
    613         """Return a file-like object for storing state.""" 
    614         return open(self.path, 'wb') 
    615  
    616     def retrieve(self): 
    617         """Return a file-like object for fetching state.""" 
    618         return open(self.path, 'rb') 
    619  
    620     def exists(self): 
    621         """Does the state store exist?""" 
    622         return os.path.exists(self.path) 
    623500 
    624501 
     
    804681    desired, a `StateStore` object should be passed to the `Framework`.""" 
    805682 
    806     def __init__(self, indexer=None, mode=READWRITE, state_store=None, 
    807                  reduce=None, stemmer=None): 
     683    def __init__(self, indexer=None, mode=READWRITE, reduce=None, 
     684                 stemmer=None): 
    808685        """`indexer` is a URI used to construct an indexer, or an `Indexer` 
    809686        object. 
     
    825702            self.reduce = reduce 
    826703 
    827         self.state_store = state_store 
    828704        self.indexer = indexer 
    829  
    830         from pyndexter.sources.metasource import MetaSource 
    831         self.source = MetaSource(self) 
    832705 
    833706    def set_indexer(self, indexer): 
     
    840713            self._indexer = indexer 
    841714 
    842         if self.state_store is None: 
    843             self.state_store = self.indexer.state_store() 
    844  
    845715    def get_indexer(self): 
    846716        return self._indexer 
     
    848718    indexer = property(get_indexer, set_indexer) 
    849719 
    850     def add_source(self, source): 
    851         """ Add a source to be indexed to the framework. Can either be a 
    852         `Source` instance or a URI.""" 
    853         if isinstance(source, (basestring, URI)): 
    854             Source = self._load_plugin('source', source) 
    855             source = Source(framework=self, uri=source) 
    856         self.source.add_source(source) 
    857  
    858720    def fetch(self, uri): 
    859721        """ Fetch a document. """ 
    860         uri = URI(uri) 
    861         return self.source.fetch(uri) 
     722        return self.indexer.fetch(URI(uri)) 
    862723 
    863724    def __iter__(self): 
    864         """ Iterate over all URI's in the document source. """ 
    865             for uri in self.source: 
     725        """ Iterate over all URI's in the indexer. """ 
     725            for uri in self.indexer: 
    866727            yield uri 
    867728 
    868     def update(self, filter=None, context=None): 
    869         """ Update the index with the current state of the document source. 
    870  
    871         `filter` is a callable in the form `(framework, context, stream)`, 
    872         where `stream` is an iterable of `(transition, uri)` pairs.""" 
     729    def index(self, document): 
     730        """Index a single document, specified as a Document object.""" 
    873731        self._assert_rw() 
    874         if not self.state_store: 
    875             raise IndexerError("Indexer not capable of storing source state, " 
    876                                "and store not provided to Framework - not " 
    877                                "capable of automatic updates.") 
    878         if not filter: 
    879             def filter(framework, context, stream): 
    880                 for transition, uri in stream: 
    881                     yield transition, uri 
    882  
    883         if self.state_store.exists(): 
    884             store = self.state_store.retrieve() 
    885             for transition, uri in filter(self, context, 
    886                                           self.source.difference(store)): 
    887                 if transition == REMOVED: 
    888                     self.discard(uri) 
    889                 elif transition == MODIFIED: 
    890                     self.replace(uri) 
    891                 else: 
    892                     self.index(uri) 
    893         else: 
    894             def fake_difference(): 
    895                 for uri in self.source: 
    896                     yield ADDED, uri 
    897  
    898             for transition, uri in filter(self, context, fake_difference()): 
    899                 self.index(uri) 
    900         self.flush() 
    901  
    902     def index(self, document): 
    903         """Index a single document, specified as either a Document object or a 
    904         URI.""" 
    905         self._assert_rw() 
    906         if isinstance(document, (URI, basestring)): 
    907             document = self.fetch(document) 
     732        assert isinstance(document, Document) 
    908733        return self.indexer.index(document) 
    909734 
     
    917742 
    918743    def replace(self, document): 
    919         """Replace document in the index, specified as either a Document object 
    920         or a URI.""" 
     744        """Replace document in the index, specified as a Document object.""" 
    921745        self._assert_rw() 
    922         if isinstance(document, (URI, basestring)): 
    923             document = self.fetch(document) 
     746        assert isinstance(document, Document) 
    924747        return self.indexer.replace(document) 
    925748 
     
    948771        """Flush indexer state to disk.""" 
    949772        if self.mode == READWRITE: 
    950             if self.mode == READWRITE and self.state_store: 
    951                 store = self.state_store.store() 
    952                 self.source.marshal(store) 
    953773            self.indexer.flush() 
    954774