Changeset 449
- Timestamp:
- 08/15/07 09:17:38 (1 year ago)
- Files:
-
- pyndexter/trunk/pyndexter/indexers/builtin.py (modified) (3 diffs)
- pyndexter/trunk/pyndexter/__init__.py (modified) (11 diffs)
- pyndexter/trunk/pyndexter/sources (deleted)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/trunk/pyndexter/indexers/builtin.py
r399 r449 162 162 self.config.setdefault('wordid', 0) 163 163 else: 164 self._words = self._wids = lambda w: set( w)165 self._word = self._wid = lambda w: w164 self._words = self._wids = lambda w: set(map(unicode, w)) 165 self._word = self._wid = lambda w: unicode(w) 166 166 167 167 def index(self, document): … … 274 274 out = set() 275 275 for word in words: 276 out.add(self._wid( word))276 out.add(self._wid(unicode(word))) 277 277 return out 278 278 … … 286 286 def _wid(self, word): 287 287 """Return, or allocate, a unique word identifier.""" 288 word = unicode(word) 288 289 try: 289 290 return self.wordid[word] pyndexter/trunk/pyndexter/__init__.py
r397 r449 12 12 indexing engines. It is similar in purpose to the Python DB API. 13 13 14 The main class users will be dealing with is Framework. This class 15 ties indexers and sources of documents together and provides a mechanism 16 for performing automatic updates. 14 The main class users will be dealing with is Framework. This presents 15 a convenient interface to the backend indexers. 17 16 18 17 An example of indexing all .txt files underneath ``/usr/share/doc``: … … 21 20 22 21 import os 23 from pyndexter import Framework 22 from pyndexter import Framework, Document 24 23 25 24 framework = Framework('hyperestraier:///tmp/hyperestraier.idx') 26 framework.add_source('file:///usr/share/doc?include=*.txt') 27 28 framework.update() 25 26 path = '/usr/share/doc' 27 28 for file in [path + f for f in os.listdir(path) if f.endswith('.txt')]: 29 doc = Document(file, open(file).read()) 30 framework.index(doc) 29 31 30 32 # Find all documents with Linus and Torvalds in them … … 64 66 READONLY READWRITE 65 67 66 Query Framework Document Source Indexer Result StateStoreHit PluginFactory URI68 Query Framework Document Indexer Result Hit PluginFactory URI 67 69 Excerpt 68 70 """.split() … … 174 176 content = property(lambda self: self._get_content(), 175 177 lambda self, value: self._set_content(value)) 176 177 178 class Source(object):179 """ A source of indexable documents. A Source object is responsible for not180 only fetching documents and iterating over them, but for determining what181 has changed in the source.182 183 Determing what has changed is achieved with the state() and difference()184 methods. The `state` of a source is the minimum information required to185 be able to determine what has changed. For FileSource this is a list of all186 files and their modification times, for a SubversionSource it would be as187 simple as the changeset number. By default, ``marshal()`` and188 ``difference()`` assume that ``_state`` will contain a dictionary of189 uri:modification-time mappings.190 191 All URI's passed to and from Source objects must be `URI` objects.192 193 (All attributes, including document contents and URI's must be in unicode)194 """195 196 def __init__(self, framework, include=None, exclude=None, predicate=None):197 if include is None and exclude is None:198 include = ['*']199 exclude = []200 elif include is None:201 include = []202 elif exclude is None:203 exclude = []204 self.framework = framework205 self.include = include206 self.exclude = exclude207 self.predicate = predicate or self._glob_predicate208 self._state = {}209 210 def matches(self, uri):211 """ Does this source handle documents matching the given URI? (This212 method is primarily used by the MetaSource class) """213 raise NotImplementedError214 215 def __hash__(self):216 """ The hash must uniquely identify the source. (This method is217 primarily used by the MetaSource class) """218 raise NotImplementedError219 220 def __iter__(self):221 """ Iterate over all *valid* URI's in this source. """222 raise NotImplementedError223 224 def fetch(self, uri):225 """ Fetch a document identified by uri. Ideally the Document object226 returned would not have the content included, but would pass a callable227 to the Document constructor that can fetch it. Should raise228 DocumentNotFound if unable to fetch the document. """229 raise NotImplementedError230 231 def exists(self, uri):232 """ Does the document exist at `uri`? """233 try:234 self.fetch(uri)235 return True236 except DocumentNotFound:237 return False238 239 def marshal(self, file):240 """ Store the state of the `Source` to `file`. Used during an241 `update()`. """242 state = pickle.dumps(self._state, 2)243 gzip.GzipFile(filename='pyndexter source state', fileobj=file,244 mode='wb', compresslevel=1).write(state)245 246 def difference(self, file):247 """ Return an iterable of tuples representing the differences between248 the current state of the `Source` and that in the provided state. Each249 tuple is in the form `(<transition>, uri)`, where <transition> is one250 of `ADDED`, `REMOVED` or `MODIFIED` and uri is a URI object."""251 current = set()252 try:253 ungzipped = gzip.GzipFile(fileobj=file, mode='rb').read()254 state = pickle.loads(ungzipped)255 except Exception, e:256 raise InvalidState('Invalid state provided to document source. '257 'Exception was %s: %s' % (e.__class__.__name__, e))258 for uri in self:259 uuri = unicode(uri)260 current.add(uuri)261 if uuri not in state:262 yield (ADDED, uri)263 elif self.fetch(uri).changed != state[uuri]:264 yield (MODIFIED, uri)265 for removed in set(state.keys()).difference(current):266 yield (REMOVED, URI(removed))267 268 # Useful helper methods269 def _glob_predicate(self, uri):270 """ Given a list of include and exclude pattern lists, return whether271 the given uri matches. """272 uri = unicode(uri)273 from fnmatch import fnmatch274 for pattern in self.exclude:275 if fnmatch(uri, pattern):276 return False277 for pattern in self.include:278 if fnmatch(uri, pattern):279 return True280 return False281 178 282 179 … … 601 498 return out 602 499 return u' '.join(out) 603 604 605 class StateStore(object):606 """A class providing file-like objects for storage and retrieval of607 framework state."""608 609 def __init__(self, path):610 self.path = path611 612 def store(self):613 """Return a file-like object for storing state."""614 return open(self.path, 'wb')615 616 def retrieve(self):617 """Return a file-like object for fetching state."""618 return open(self.path, 'rb')619 620 def exists(self):621 """Does the state store exist?"""622 return os.path.exists(self.path)623 500 624 501 … … 804 681 desired, a `StateStore` object should be passed to the `Framework`.""" 805 682 806 def __init__(self, indexer=None, mode=READWRITE, state_store=None,807 reduce=None,stemmer=None):683 def __init__(self, indexer=None, mode=READWRITE, reduce=None, 684 stemmer=None): 808 685 """`indexer` is a URI used to construct an indexer, or an `Indexer` 809 686 object. … … 825 702 self.reduce = reduce 826 703 827 self.state_store = state_store828 704 self.indexer = indexer 829 830 from pyndexter.sources.metasource import MetaSource831 self.source = MetaSource(self)832 705 833 706 def set_indexer(self, indexer): … … 840 713 self._indexer = indexer 841 714 842 if self.state_store is None:843 self.state_store = self.indexer.state_store()844 845 715 def get_indexer(self): 846 716 return self._indexer … … 848 718 indexer = property(get_indexer, set_indexer) 849 719 850 def add_source(self, source):851 """ Add a source to be indexed to the framework. Can either be a852 `Source` instance or a URI."""853 if isinstance(source, (basestring, URI)):854 Source = self._load_plugin('source', source)855 source = Source(framework=self, uri=source)856 self.source.add_source(source)857 858 720 def fetch(self, uri): 859 721 """ Fetch a document. """ 860 uri = URI(uri) 861 return self.source.fetch(uri) 722 return self.indexer.fetch(URI(uri)) 862 723 863 724 def __iter__(self): 864 """ Iterate over all URI's in the document source. """865 for uri in self. source:725 """ Iterate over all URI's in the indexer. """ 726 for uri in self.indexer: 866 727 yield uri 867 728 868 def update(self, filter=None, context=None): 869 """ Update the index with the current state of the document source. 870 871 `filter` is a callable in the form `(framework, context, stream)`, 872 where `stream` is an iterable of `(transition, uri)` pairs.""" 729 def index(self, document): 730 """Index a single document, specified as a Document object.""" 873 731 self._assert_rw() 874 if not self.state_store: 875 raise IndexerError("Indexer not capable of storing source state, " 876 "and store not provided to Framework - not " 877 "capable of automatic updates.") 878 if not filter: 879 def filter(framework, context, stream): 880 for transition, uri in stream: 881 yield transition, uri 882 883 if self.state_store.exists(): 884 store = self.state_store.retrieve() 885 for transition, uri in filter(self, context, 886 self.source.difference(store)): 887 if transition == REMOVED: 888 self.discard(uri) 889 elif transition == MODIFIED: 890 self.replace(uri) 891 else: 892 self.index(uri) 893 else: 894 def fake_difference(): 895 for uri in self.source: 896 yield ADDED, uri 897 898 for transition, uri in filter(self, context, fake_difference()): 899 self.index(uri) 900 self.flush() 901 902 def index(self, document): 903 """Index a single document, specified as either a Document object or a 904 URI.""" 905 self._assert_rw() 906 if isinstance(document, (URI, basestring)): 907 document = self.fetch(document) 732 assert isinstance(document, Document) 908 733 return self.indexer.index(document) 909 734 … … 917 742 918 743 def replace(self, document): 919 """Replace document in the index, specified as either a Document object 920 or a URI.""" 744 """Replace document in the index, specified as a Document object.""" 921 745 self._assert_rw() 922 if isinstance(document, (URI, basestring)): 923 document = self.fetch(document) 746 assert isinstance(document, Document) 924 747 return self.indexer.replace(document) 925 748 … … 948 771 """Flush indexer state to disk.""" 949 772 if self.mode == READWRITE: 950 if self.mode == READWRITE and self.state_store:951 store = self.state_store.store()952 self.source.marshal(store)953 773 self.indexer.flush() 954 774
