Changeset 387
- Timestamp: 02/12/07 02:48:24 (2 years ago)
- Files:
- pyndexter/trunk/pyndexter/indexers/builtin.py (modified) (4 diffs)
- pyndexter/trunk/pyndexter/indexers/hype.py (modified) (4 diffs)
- pyndexter/trunk/pyndexter/indexers/hyperestraier.py (modified) (3 diffs)
- pyndexter/trunk/pyndexter/indexers/lucene.py (modified) (3 diffs)
- pyndexter/trunk/pyndexter/indexers/lupy.py (modified) (3 diffs)
- pyndexter/trunk/pyndexter/indexers/pyndex.py (modified) (2 diffs)
- pyndexter/trunk/pyndexter/indexers/pyrex.pyx (deleted)
- pyndexter/trunk/pyndexter/indexers/xapian.py (modified) (4 diffs)
- pyndexter/trunk/pyndexter/__init__.py (modified) (8 diffs)
- pyndexter/trunk/pyndexter/sources/file.py (modified) (2 diffs)
- pyndexter/trunk/pyndexter/util.py (modified) (1 diff)
- pyndexter/trunk/.todo (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/trunk/pyndexter/indexers/builtin.py
r386 r387 158 158 def index(self, document): 159 159 160 uri = self._wid(document.uri)160 uri = unicode(self._wid(document.uri)) 161 161 words = self._wids(self.framework.reduce(document.content)) 162 162 doc_set = set([uri]) … … 189 189 self.uris.replace(uri, words) 190 190 191 replace = index 192 191 193 def discard(self, uri): 194 uri = unicode(uri) 192 195 try: 193 196 del self.attributes[uri] … … 200 203 self.uris.remove(uri) 201 204 202 replace = index203 204 205 def __iter__(self): 205 206 for uri in self.uris.keys(): 206 yield self._word(uri)207 yield URI(self._word(uri)) 207 208 208 209 def fetch(self, uri): 210 uri = unicode(uri) 209 211 uriid = self._wid(uri) 210 212 attributes = self.attributes.get(uriid, {}) … … 292 294 framework = indexer.framework 293 295 attributes = indexer.attributes.get(uri, {}) 294 attributes['uri'] = uri296 attributes['uri'] = URI(uri) 295 297 attributes = dict([(k.encode('utf-8'), v) for k, v in attributes.iteritems()]) 296 298 return Hit(current=framework.fetch, indexed=indexer.fetch, **attributes) pyndexter/trunk/pyndexter/indexers/hype.py
r381 r387 39 39 40 40 def index(self, document): 41 hdoc = hype.Document(document.uri)41 hdoc = hype.Document(unicode(document.uri)) 42 42 for k, v in document.attributes.iteritems(): 43 43 if k != 'uri': … … 48 48 49 49 def discard(self, uri): 50 doc = self.db.get_doc_by_uri(uri)50 doc = self.db.get_doc_by_uri(unicode(uri)) 51 51 if not doc: 52 52 raise DocumentNotFound(uri) … … 62 62 63 63 def fetch(self, uri): 64 doc = self.db.get_doc_by_uri(uri)64 doc = self.db.get_doc_by_uri(unicode(uri)) 65 65 if not doc: 66 66 raise DocumentNotFound(uri) … … 83 83 else: 84 84 attributes[k] = hdoc.get(k) 85 attributes['uri'] = URI(attributes['uri']) 85 86 return attributes 86 87 pyndexter/trunk/pyndexter/indexers/hyperestraier.py
r381 r387 53 53 54 54 def discard(self, uri): 55 uri = uri.encode('utf-8')56 id = self.db.uri_to_id(uri)55 uuri = unicode(uri).encode('utf-8') 56 id = self.db.uri_to_id(uuri) 57 57 if id == -1: 58 58 raise DocumentNotFound(uri) … … 60 60 61 61 def fetch(self, uri): 62 uri = uri.encode('utf-8')63 id = self.db.uri_to_id(uri)62 uuri = unicode(uri).encode('utf-8') 63 id = self.db.uri_to_id(uuri) 64 64 if id == -1: 65 65 raise DocumentNotFound(uri) … … 101 101 else: 102 102 attributes[k] = hdoc.attr(k).decode('utf-8') 103 attributes['uri'] = URI(attributes['uri']) 103 104 return attributes 104 105 pyndexter/trunk/pyndexter/indexers/lucene.py
r381 r387 31 31 doc = PyLucene.Document() 32 32 for k, v in document.attributes.iteritems(): 33 doc.add(PyLucene.Field(str(k), str(v), PyLucene.Field.Store.YES, 33 doc.add(PyLucene.Field(unicode(k), unicode(v), 34 PyLucene.Field.Store.YES, 34 35 PyLucene.Field.Index.TOKENIZED)) 35 36 reader = PyLucene.StringReader(document.content) … … 39 40 def discard(self, uri): 40 41 reader = PyLucene.IndexReader.open(self.db_path) 41 reader.deleteDocuments(PyLucene.Term('uri', uri))42 reader.deleteDocuments(PyLucene.Term('uri', unicode(uri))) 42 43 reader.close() 43 44 … … 85 86 for field in hit.fields(): 86 87 attributes[field.name().encode('utf-8')] = field.stringValue() 88 attributes['uri'] = URI(attributes['uri']) 87 89 return Hit(current=self.indexer.framework.fetch, 88 90 indexed=self.indexer.fetch, **attributes) pyndexter/trunk/pyndexter/indexers/lupy.py
r380 r387 33 33 34 34 def index(self, document): 35 attributes = dict([('_' + k.encode('utf-8'), str(v))35 attributes = dict([('_' + k.encode('utf-8'), unicode(v)) 36 36 for k, v in document.attributes.iteritems() 37 37 if v is not None]) … … 40 40 41 41 def discard(self, uri): 42 self.db.delete(uri=uri)42 self.db.delete(uri=unicode(uri)) 43 43 44 44 def search(self, query): … … 87 87 fields = dict([(str(k), doc.get(k)) for k in doc.fieldNames]) 88 88 fields['score'] = self.context.score(index) 89 fields['uri'] = URI(fields['uri']) 89 90 return Hit(current=self.indexer.framework.fetch, 90 91 indexed=self.indexer.fetch, **fields) pyndexter/trunk/pyndexter/indexers/pyndex.py
r381 r387 29 29 30 30 def index(self, document): 31 self.db.index(document.uri.encode('utf-8'), document.content.encode('utf-8')) 31 uri = unicode(document.uri).encode('utf-8') 32 self.db.index(uri, document.content.encode('utf-8')) 32 33 33 34 def discard(self, uri): 34 35 # FIXME Is there a supported way of deleting documents? This is hackish. 35 36 # FIXME Is there a way of storing attributes? 36 self.db.index(uri.encode('utf-8'), '')37 self.db.index(unicode(uri).encode('utf-8'), '') 37 38 38 39 def search(self, query): 39 40 # FIXME Should probably do a search on each term, and perform set 40 41 # operations. 41 qs = query.as_string(and_=' ', or_=' ', not_='')42 qs = ' '.join(query.terms()) 42 43 return PyndexResult(self, query, self.db.find(qs)) 43 44 … … 56 57 class PyndexResult(Result): 57 58 def __iter__(self): 58 for uri in self.context: 59 yield Hit(uri=uri.doc.docname, score=uri.score, 60 current=self.indexer.framework.fetch, 61 indexed=self.indexer.fetch) 59 for hit in self.context: 60 yield self._translate(hit) 61 62 def __getitem__(self, index): 63 return self._translate(self.context[index]) 64 65 def _translate(self, hit): 66 return Hit(uri=URI(hit.doc.docname), score=hit.score, 67 current=self.indexer.framework.fetch, 68 indexed=self.indexer.fetch) pyndexter/trunk/pyndexter/indexers/xapian.py
r383 r387 44 44 # FIXME Xapian doesn't support UTF-8 yet. "Coming soon." 45 45 content = document.content.encode('utf-8') 46 uri = document.uri.encode('utf-8')46 uri = unicode(document.uri).encode('utf-8') 47 47 48 48 doc.set_data(content) … … 59 59 60 60 def discard(self, uri): 61 self.db.delete_document('Q' + uri.encode('utf-8'))61 self.db.delete_document('Q' + unicode(uri).encode('utf-8')) 62 62 63 63 def fetch(self, uri): 64 term = 'Q' + uri.encode('utf-8')64 term = 'Q' + unicode(uri).encode('utf-8') 65 65 for docid in self.db.postlist(term): 66 66 doc = self.db.get_document(docid[0]) … … 68 68 return Document(uri=uri, content=doc.get_data().decode('utf-8'), 69 69 quality=0.95) 70 raise DocumentNotFound(uri) 70 71 71 72 def __iter__(self): … … 126 127 uri = terms.next()[0][1:] 127 128 assert uri, 'uniQue term (URI) not found in document term list' 128 return Hit(uri,129 return Hit(URI(uri), 129 130 current=self.indexer.framework.fetch, 130 131 indexed=self.indexer.fetch, pyndexter/trunk/pyndexter/__init__.py
r386 r387 238 238 the current state of the `Source` and that in the provided state. Each 239 239 tuple is in the form `(<transition>, uri)`, where <transition> is one 240 of `ADDED`, `REMOVED` or `MODIFIED` ."""240 of `ADDED`, `REMOVED` or `MODIFIED` and uri is a URI object.""" 241 241 current = set() 242 242 try: … … 247 247 'Exception was %s: %s' % (e.__class__.__name__, e)) 248 248 for uri in self: 249 current.add(uri) 250 if uri not in state: 249 uuri = unicode(uri) 250 current.add(uuri) 251 if uuri not in state: 251 252 yield (ADDED, uri) 252 elif self.fetch(uri).changed != state[u ri]:253 elif self.fetch(uri).changed != state[uuri]: 253 254 yield (MODIFIED, uri) 254 255 for removed in set(state.keys()).difference(current): 255 yield (REMOVED, removed)256 yield (REMOVED, URI(removed)) 256 257 257 258 # Useful helper methods … … 261 262 from fnmatch import fnmatch 262 263 for pattern in self.exclude: 263 if fnmatch(u ri, pattern):264 if fnmatch(unicode(uri), pattern): 264 265 return False 265 266 for pattern in self.include: 266 if fnmatch(u ri, pattern):267 if fnmatch(unicode(uri), pattern): 267 268 return True 268 269 return False … … 779 780 if stemmer is None: 780 781 stemmer = lambda word: word 781 elif isinstance(stemmer, basestring):782 elif isinstance(stemmer, (basestring, URI)): 782 783 Stemmer = self._load_plugin('stemmer', stemmer) 783 784 stemmer = Stemmer(uri=stemmer) … … 786 787 self.reduce = reduce 787 788 788 if isinstance(indexer, basestring): 789 self.indexer = self._load_plugin('indexer', indexer) 790 self.indexer = self.indexer(framework=self, uri=indexer) 791 else: 792 self.indexer = indexer 793 794 if state_store is None and indexer: 795 self.state_store = self.indexer.state_store() 796 else: 797 self.state_store = state_store 789 self.state_store = state_store 790 self.indexer = indexer 798 791 799 792 from pyndexter.sources.metasource import MetaSource … … 801 794 802 795 def set_indexer(self, indexer): 803 """Set the `Framework` 
`Indexer`.""" 804 self.indexer = indexer 796 """Set the `Framework` indexer. Can either be a URI or an `Indexer` 797 object.""" 798 if isinstance(indexer, (basestring, URI)): 799 Indexer = self._load_plugin('indexer', indexer) 800 self._indexer = Indexer(framework=self, uri=indexer) 801 else: 802 self._indexer = indexer 803 805 804 if self.state_store is None: 806 self.state_store = indexer.state_store() 805 self.state_store = self.indexer.state_store() 806 807 def get_indexer(self): 808 return self._indexer 809 810 indexer = property(get_indexer, set_indexer) 807 811 808 812 def add_source(self, source): 809 813 """ Add a source to be indexed to the framework. Can either be a 810 814 `Source` instance or a URI.""" 811 if isinstance(source, basestring):815 if isinstance(source, (basestring, URI)): 812 816 Source = self._load_plugin('source', source) 813 817 source = Source(framework=self, uri=source) … … 816 820 def fetch(self, uri): 817 821 """ Fetch a document. """ 822 uri = URI(uri) 818 823 return self.source.fetch(uri) 819 824 … … 861 866 URI.""" 862 867 self._assert_rw() 863 if isinstance(document, basestring):868 if isinstance(document, (URI, basestring)): 864 869 document = self.fetch(document) 865 870 return self.indexer.index(document) pyndexter/trunk/pyndexter/sources/file.py
r384 r387 46 46 continue 47 47 if S_ISDIR(stat.st_mode): 48 for file in walk_path(os.path.join(path, file)): 49 yield file 50 elif self.predicate(full_path) and os.access(full_path, os.R_OK) \ 48 for uri in walk_path(os.path.join(path, file)): 49 yield uri 50 elif self.predicate(URI(scheme='file', path=full_path)) \ 51 and os.access(full_path, os.R_OK) \ 51 52 and S_ISREG(stat.st_mode): 52 yield ( self._file2uri(full_path).decode(self.encoding), stat)53 yield (URI(scheme='file', path=full_path.decode(self.encoding)), stat) 53 54 54 for file, stat in walk_path('/'):55 self._state[ file] = stat.st_mtime56 yield file55 for uri, stat in walk_path('/'): 56 self._state[unicode(uri)] = stat.st_mtime 57 yield uri 57 58 58 59 def matches(self, uri): 59 scheme, netloc, path, query, fragment = urlsplit(uri, 'file') 60 path = os.path.normpath(unquote(path)) 61 return scheme == 'file' and \ 62 path.startswith(self.path) and \ 63 self.predicate(path) 60 return uri.scheme == 'file' and \ 61 uri.path.startswith(self.path) and \ 62 self.predicate(uri) 64 63 65 64 66 65 def fetch(self, uri): 67 path = self._uri2file(uri)68 66 try: 69 stat = os.stat( path)67 stat = os.stat(uri.path) 70 68 except Exception, e: 71 69 raise DocumentNotFound(uri, e) … … 75 73 76 74 def exists(self, uri): 77 return os.path.exists( self._uri2file(uri))75 return os.path.exists(uri.path) 78 76 79 77 def __hash__(self): 80 return hash( self._file2uri(self.path)+ '-'.join(self.exclude) + \78 return hash('file://' + self.path + '-'.join(self.exclude) + \ 81 79 '+'.join(self.include)) 82 80 83 81 # Internal methods 84 82 def _fetch_content(self, uri): 85 path = self._uri2file(uri) 86 return codecs.open(path, encoding='utf-8', errors='replace').read() 87 88 def _file2uri(self, file): 89 return urlunsplit(('file', '', quote(file), '', '')) 90 91 def _uri2file(self, uri): 92 scheme, location, path, query, fragment = urlsplit(uri, 'file') 93 if scheme != 'file': 94 raise InvalidURI("URI scheme in '%s' not supported 
by FileSource" 95 % scheme) 96 path = os.path.normpath(unquote(path)) 97 if not path.startswith(self.path): 98 raise InvalidURI("Requested URI '%s' is not from this FileSource" 99 % uri) 100 return path.decode(self.encoding) 83 return codecs.open(uri.path, encoding='utf-8', errors='replace').read() 101 84 102 85 pyndexter/trunk/pyndexter/util.py
r385 r387 140 140 141 141 def __repr__(self): 142 uri = self.scheme and (quote(self.scheme) + '://') or '' 142 return "<URI u'%s'>" % unicode(self) 143 144 def __str__(self): 145 uri = unicode(self.scheme and (quote(self.scheme) + u'://') or u'') 143 146 if self.username or self.password: 144 147 if self.username: 145 148 uri += quote(self.username) 146 149 if self.password: 147 uri += ':' + quote(self.password)148 uri += '@'150 uri += u':' + quote(self.password) 151 uri += u'@' 149 152 uri += quote(self.host) 150 153 if self.port: 151 uri += ':%s' % port154 uri += u':%s' % port 152 155 uri += quote(self.path) 153 156 if self.query: 154 uri += '?' + '&'.join(['&'.join(['%s=%s' % (k, quote(str(v))) for v in l]) 155 for k, l in sorted(self.query.items())]) 157 uri += u'?' + u'&'.join([u'&'.join([u'%s=%s' % (k, quote(str(v))) 158 for v in l]) 159 for k, l in sorted(self.query.items())]) 156 160 if self.fragment: 157 uri += '#' + quote(self.fragment)161 uri += u'#' + quote(self.fragment) 158 162 return uri 159 163 pyndexter/trunk/.todo
r384 r387 125 125 <note priority="medium" time="1171055477"> 126 126 Write a decent test suite. 127 <note priority="medium" time="1171271157"> 128 Test that searches return the right hits. Don't care about order. 129 </note> 130 <note priority="medium" time="1171271356"> 131 Test that all interfaces pass and receive unicode correctly. 132 </note> 133 <note priority="medium" time="1171271371"> 134 Test that all indexers and sources pass URI objects correctly. 135 </note> 127 136 </note> 128 137 </todo>
