Changeset 329
- Timestamp:
- 04/23/06 23:57:38 (2 years ago)
- Files:
-
- pyndexter/trunk/pyndexter/file.py (modified) (2 diffs)
- pyndexter/trunk/pyndexter/hyperestraier.py (modified) (3 diffs)
- pyndexter/trunk/pyndexter/__init__.py (modified) (11 diffs)
- pyndexter/trunk/pyndexter/metasource.py (modified) (2 diffs)
- pyndexter/trunk/pyndexter/xapian.py (modified) (1 diff)
- pyndexter/trunk/.todo (added)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/trunk/pyndexter/file.py
r322 r329 41 41 def fetch(self, uri): 42 42 path = self._uri2file(uri) 43 stat = os.stat(path) 43 try: 44 stat = os.stat(path) 45 except Exception, e: 46 raise DocumentNotFound(e) 44 47 return Document(uri, source=self, changed=stat.st_mtime, 45 48 content=self._fetch_content, size=stat.st_size, 46 49 created=stat.st_ctime) 50 51 def exists(self, uri): 52 return os.path.exists(self._uri2file(uri)) 47 53 48 54 def _fetch_content(self, uri): … … 59 65 def _uri2file(self, uri): 60 66 scheme, location, path, query, fragment = urlsplit(uri) 61 assert scheme in ('file', '') 67 if scheme not in ('file', ''): 68 raise InvalidURI("URI scheme in '%s' not supported by FileSource" 69 % scheme) 62 70 path = os.path.normpath(path) 63 assert path.startswith(self.root) 71 if not path.startswith(self.root): 72 raise InvalidURI("Requested URI '%s' is not from this FileSource" 73 % uri) 64 74 return path.decode(self.encoding) 65 75 pyndexter/trunk/pyndexter/hyperestraier.py
r322 r329 11 11 self.path = path 12 12 if not os.path.exists(self.path): 13 assert mode == READWRITE, "Index directory has not been initialised" 13 if mode != READWRITE: 14 raise IndexerError("Index directory has not been initialised") 14 15 os.makedirs(self.path) 15 16 self.hype_path = os.path.join(self.path, 'hyperestraier.db') … … 26 27 return self.source.fetch(uri) 27 28 doc = self.db.get_doc_by_uri(uri) 28 attributes = {} 29 for k in doc.attributes: 30 if k[0] == '@': 31 attributes[k[1:]] = doc.get(k) 32 else: 33 attributes[k] = doc.get(k) 29 if doc is None: 30 raise DocumentNotFound(uri) 31 attributes = self._translate_attributes(doc) 34 32 return Document(content=doc.text, source=self.source, **attributes) 35 33 … … 85 83 self.db = None 86 84 85 # Internal methods 86 def _translate_attributes(self, hdoc): 87 attributes = {} 88 for k in hdoc.attributes: 89 if k[0] == '@': 90 attributes[k[1:]] = hdoc.get(k) 91 else: 92 attributes[k] = hdoc.get(k) 93 return attributes 87 94 class HyperestraierSearch(Search): 88 95 def __iter__(self): 89 96 for doc in self.context: 90 97 # How do we get the score? 91 yield Hit(doc['@uri'], document=self.indexer.fetch) 98 yield Hit(document=self.indexer.fetch, 99 **self.indexer._translate_attributes(doc)) 92 100 93 101 def __len__(self): pyndexter/trunk/pyndexter/__init__.py
r322 r329 10 10 11 11 __all__ = """ 12 Error 13 InvalidURI 14 DocumentNotFound 15 InvalidMode 16 InvalidState 17 IndexerError 18 SourceError 19 12 20 REMOVED ADDED MODIFIED 21 13 22 READONLY READWRITE 23 14 24 CAP_READONLY CAP_ORDERING CAP_CONTENT CAP_ATTRIBUTES CAP_RELEVANCE CAP_HITCOUNT 15 25 CAP_LIST CAP_ITERATION 26 16 27 Document Source Indexer Search Hit 17 28 """.split() … … 37 48 38 49 50 class Error(Exception): pass 51 class DocumentNotFound(Error): pass 52 class InvalidURI(Error): pass 53 class SourceError(Error): pass 54 class InvalidState(Error): pass 55 # Indexer errors 56 class IndexerError(Error): pass 57 class InvalidMode(IndexerError): pass 58 59 39 60 class Document(object): 40 61 """ A Document represents an indexable object in pyndexter. All string … … 53 74 def __init__(self, uri, content=None, source=None, changed=None, 54 75 **attributes): 55 self.uri = uri 56 76 self._content = content 57 77 self.source = source 58 self.changed = changed 78 self.attributes = attributes 79 self.attributes.update({'uri': uri, 'changed': changed}) 59 80 self.__dict__.update(attributes) 60 81 61 82 def __repr__(self): 62 83 return '<Document "%s">' % self.uri 84 85 def __getattr__(self, key): 86 try: 87 return self.attributes[key] 88 except KeyError, e: 89 raise AttributeError(str(e)) 90 91 def __hash__(self): 92 return hash(self.uri) 93 94 def _set_content(self, content): 95 self._content = content 63 96 64 97 def _get_content(self): … … 66 99 self._content = self._content(self.uri) 67 100 return self._content 68 69 def __hash__(self): 70 return hash(self.uri) 71 72 def _set_content(self, content): 73 self._content = content 74 75 101 content = property(lambda self: self._get_content(), 76 102 lambda self, value: self._set_content(value)) 77 78 attributes = property(lambda self: dict( 79 [(k, v) for k, v in self.__dict__.iteritems() 80 if k[0] != '_' and k != 'source'])) 81 103 82 104 … … 110 132 """ Fetch a document identified by uri. 
Ideally the Document object 111 133 returned would not have the content included, but would pass a callable 112 to the Document constructor that can fetch it. """ 113 raise NotImplementedError 134 to the Document constructor that can fetch it. Should raise 135 DocumentNotFound if unable to fetch the document. """ 136 raise NotImplementedError 137 138 def exists(self, uri): 139 """ Does the document exist at `uri`? """ 140 try: 141 self.fetch(uri) 142 return True 143 except DocumentNotFound: 144 return False 114 145 115 146 def state(self): … … 132 163 of ADDED, REMOVED or MODIFIED. """ 133 164 state = StringIO(state) 134 state = pickle.loads(gzip.GzipFile(fileobj=state, mode='rb').read()) 165 try: 166 ungzipped = gzip.GzipFile(fileobj=state, mode='rb').read() 167 state = pickle.loads(ungzipped) 168 except Exception, e: 169 raise InvalidState(e) 135 170 current = set() 136 171 for uri in self: … … 161 196 """ Fetch a document. Try to use the indexers data, but fall back 162 197 on the Source copy, if available. """ 163 assert self.source, "This indexer has no Source object associated " \ 164 "with and as such can not fetch() documents." 198 if not self.source: 199 raise SourceError("This indexer has no Source object associated " 200 "with and as such can not fetch() documents.") 165 201 return self.source.fetch(uri) 166 202 … … 172 208 """ Update the index with the current state of the document source. """ 173 209 self._assert_rw() 174 assert self.source, "Can't perform automatic update without a Source. " 175 assert self.state_path, "Source state path not set, Indexer is " \ 176 "not capable of automatic updates." 
210 if not self.source: 211 raise SourceError("Can't perform automatic update without a Source.") 212 if not self.state_path: 213 raise IndexerError("Source state path not set, Indexer is not " 214 "capable of automatic updates.") 177 215 if os.path.exists(self.state_path): 178 216 state = open(self.state_path).read() … … 224 262 # Helper methods 225 263 def _assert_rw(self): 226 assert self.mode == READWRITE, \ 227 "%s must be in READWRITE mode for this operation" % \ 228 self.__class__.__name__ 264 if self.mode != READWRITE: 265 raise InvalidMode("%s must be in READWRITE mode for this " 266 "operation" % self.__class__.__name__) 229 267 230 268 def _sync_source_state(self): … … 264 302 265 303 class Hit(object): 266 """ Wrapper around a search hit. """ 304 """ Wrapper around a search hit. If `document` is a callable, it should 305 be a function that fetches the Document associated with `uri`, which is 306 passed as the only argument. 307 """ 267 308 268 309 def __init__(self, uri, document=None, score=None, **attributes): 269 self.uri = uri 270 self.score = score 271 310 self._document = document 272 self.__dict__.update(attributes) 311 self.attributes = attributes 312 self.attributes.update({'uri': uri, 'score': score}) 313 314 def __getattr__(self, key): 315 try: 316 return self.attributes[key] 317 except KeyError, e: 318 raise AttributeError(str(e)) 273 319 274 320 def _get_document(self): … … 276 322 self._document = self._document(self.uri) 277 323 return self._document 278 279 280 324 document = property(_get_document) pyndexter/trunk/pyndexter/metasource.py
r322 r329 1 from pyndexter import Source 1 from pyndexter import * 2 2 from urlparse import urlsplit 3 3 … … 21 21 return True 22 22 return False 23 24 def fetch(self, uri): 25 for source in self.sources: 26 if source.matches(uri): 27 return source.fetch(uri) 28 raise URINotFound pyndexter/trunk/pyndexter/xapian.py
r327 r329 23 23 self.path = path 24 24 if not os.path.exists(self.path): 25 assert mode == READWRITE, "Index directory has not been initialised" 25 if mode != READWRITE: 26 raise IndexerError("Index directory has not been initialised") 26 27 os.makedirs(self.path) 27 28 self.idx_path = os.path.join(path, 'xapian.db')
