Changeset 332
- Timestamp:
- 30/04/06 14:09:58 (4 years ago)
- Location:
- pyndexter/trunk
- Files:
-
- 3 added
- 6 modified
-
.todo (modified) (1 diff)
-
pyndexter/__init__.py (modified) (11 diffs)
-
pyndexter/default.py (added)
-
pyndexter/file.py (modified) (5 diffs)
-
pyndexter/hyperestraier.py (modified) (2 diffs)
-
pyndexter/metasource.py (modified) (4 diffs)
-
pyndexter/portalocker.py (added)
-
pyndexter/util.py (added)
-
pyndexter/xapian.py (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
-
pyndexter/trunk/.todo
r329 r332 9 9 Finish PyLucene adapter 10 10 </note> 11 <note priority="medium" time="1145854608" >11 <note priority="medium" time="1145854608" done="1146296772"> 12 12 Finish MetaSource 13 13 </note> 14 <note priority="medium" time="1146296806"> 15 Optimise on disk format for DefaultIndexer. Use URI/word "ids" rather than full word. 16 </note> 17 <note priority="medium" time="1146321654"> 18 I think it might need a MIME filter system, for translating known content types to plain text for indexing. eg. Just the content of HTML pages. This could get out of hand. 19 </note> 20 <note priority="medium" time="1146328561" done="1146368244"> 21 state() is being called, which in the naive implementation simply walks the entire source. Need some way around this. Should the state() be accumulated somehow when the source is being walked? 22 </note> 23 <note priority="medium" time="1146331225" done="1146368238"> 24 HTTPSource should be able to handle multiple iterations, but self._traversed renders this impossible. 25 </note> 14 26 </todo> -
pyndexter/trunk/pyndexter/__init__.py
r331 r332 23 23 24 24 CAP_READONLY CAP_ORDERING CAP_CONTENT CAP_ATTRIBUTES CAP_RELEVANCE CAP_HITCOUNT 25 CAP_LIST CAP_ITERATION 25 CAP_LIST CAP_ITERATION CAP_ASTERISK CAP_QUESTION CAP_WHOLEWORD CAP_UNION 26 CAP_INTERSECTION 26 27 27 28 Document Source Indexer Search Hit … … 46 47 CAP_LIST = 64 # Search result supports list-style lookup 47 48 CAP_ITERATION = 128 # Supports index iteration 49 CAP_ASTERISK = 256 # Supports the asterisk wildcard (*<term>*) 50 CAP_QUESTION = 512 # Supports the single character wildcard (a?c) 51 CAP_WHOLEWORD = 512 # Performs whole word searches by default 52 CAP_UNION = 1024 # Supports unions (ie. matches documents with any word) 53 CAP_INTERSECTION = 2048 # Supports intersections (ie. matches documents with 54 # all words) 48 55 49 56 … … 97 104 return self.attributes[key] 98 105 except KeyError, e: 99 raise AttributeError( str(e))106 raise AttributeError(unicode(e)) 100 107 101 108 def __hash__(self): … … 116 123 """ A source of indexable documents. A Source object is responsible for not 117 124 only fetching documents and iterating over them, but for determining what 118 has changed in the source. This is achieved with the state() and 119 difference() methods. The ''state'' of a source is the minimum information 120 required to be able to determine what has changed. For FileSource this is a 121 list of all files and their modification times, for a SubversionSource it 122 would be as simple as the changeset number. 125 has changed in the source. 126 127 Determing what has changed is achieved with the state() and difference() 128 methods. The ''state'' of a source is the minimum information required to 129 be able to determine what has changed. For FileSource this is a list of all 130 files and their modification times, for a SubversionSource it would be as 131 simple as the changeset number. The default state() and difference() 132 methods use the data in self._state. 123 133 124 134 (All attributes, including document contents and URI's must be in unicode) 125 135 """ 136 137 def __init__(self, include=['*'], exclude=[], predicate=None): 138 self.include = include 139 self.exclude = exclude 140 self.predicate = predicate or self._glob_predicate 141 self._state = {} 126 142 127 143 def matches(self, uri): … … 158 174 """ Return a raw byte string representing the current state of this 159 175 source. Storage and retrieval of this byte string is typically handled 160 by the Indexer. """ 161 state = {} 162 for uri in self: 163 state[uri] = self.fetch(uri).changed 164 state = pickle.dumps(state, 2) 165 compressed = StringIO() 166 gzip.GzipFile(filename='pyndexer source state', fileobj=compressed, 167 mode='wb', compresslevel=1).write(state) 168 return compressed.getvalue() 176 by the Indexer. If this method returns false, the Indexer will assume 177 that state information is not available, and do nothing. """ 178 if not self._state: 179 return None 180 return self._marshal_state(self._state) 169 181 170 182 def difference(self, state): … … 173 185 tuple is in the form `(<transition>, uri)`, where <transition> is one 174 186 of ADDED, REMOVED or MODIFIED. """ 175 state = StringIO(state)176 try:177 ungzipped = gzip.GzipFile(fileobj=state, mode='rb').read()178 state = pickle.loads(ungzipped)179 except Exception, e:180 raise InvalidState('Invalid state provided to document source. '181 'Exception was %s: %s' % (e.__class__.__name__, e))182 187 current = set() 188 state = self._unmarshal_state(state) 183 189 for uri in self: 184 190 current.add(uri) … … 190 196 yield (REMOVED, removed) 191 197 198 # Useful helper methods 199 def _glob_predicate(self, uri): 200 """ Given a list of include and exclude pattern lists, return whether 201 the given uri matches. """ 202 from fnmatch import fnmatch 203 for pattern in self.exclude: 204 if fnmatch(uri, pattern): 205 return False 206 for pattern in self.include: 207 if fnmatch(uri, pattern): 208 return True 209 return False 210 211 def _marshal_state(self, state): 212 """ Pickle and compress state. This is used by the default state() 213 implementation, but can be reused. """ 214 state = pickle.dumps(state, 2) 215 compressed = StringIO() 216 gzip.GzipFile(filename='pyndexer source state', fileobj=compressed, 217 mode='wb', compresslevel=1).write(state) 218 return compressed.getvalue() 219 220 def _unmarshal_state(self, state): 221 """ Uncompress and unpickle state. Used by the default difference() 222 method, but can be reused. """ 223 state = StringIO(state) 224 try: 225 ungzipped = gzip.GzipFile(fileobj=state, mode='rb').read() 226 return pickle.loads(ungzipped) 227 except Exception, e: 228 raise InvalidState('Invalid state provided to document source. ' 229 'Exception was %s: %s' % (e.__class__.__name__, e)) 192 230 193 231 class Indexer(object): … … 209 247 on the Source copy, if available. """ 210 248 if not self.source: 211 raise IndexerError("This indexer has no Source object associated"212 " withand as such can not fetch() documents.")249 raise IndexerError("This indexer has no associated Source object " 250 "and as such can not fetch() documents.") 213 251 return self.source.fetch(uri) 214 252 … … 226 264 "capable of automatic updates.") 227 265 if os.path.exists(self.state_path): 228 state = open(self.state_path).read() 266 try: 267 state = open(self.state_path).read() 268 except Exception, e: 269 raise IndexerError("Source state '%s' is not readable. " 270 "Exception was %s: %s" % 271 (self.state_path, e.__class__.__name__, 272 unicode(e))) 273 229 274 for transition, uri in self.source.difference(state): 230 275 if transition == REMOVED: … … 282 327 constructor. """ 283 328 if self.mode == READWRITE and self.source and self.state_path: 284 open(self.state_path, 'w').write(self.source.state()) 329 state = self.source.state() 330 if state: 331 open(self.state_path, 'w').write(self.source.state()) 332 333 def _init_env(self, path): 334 """ Create a default environment with a <path> base directory. """ 335 if not os.path.exists(path): 336 if self.mode != READWRITE: 337 raise IndexError("Indexer environment has not been initialised") 338 os.makedirs(path) 285 339 286 340 class Search(object): … … 329 383 return self.attributes[key] 330 384 except KeyError, e: 331 raise AttributeError( str(e))385 raise AttributeError(unicode(e)) 332 386 333 387 def _get_document(self): -
pyndexter/trunk/pyndexter/file.py
r331 r332 1 1 import sys 2 2 import codecs 3 import os.path 4 from fnmatch import fnmatch 5 from dircache import listdir 3 import os 4 from stat import * 6 5 from urlparse import urlsplit, urlunsplit 7 6 … … 11 10 def __init__(self, root, include=['*'], exclude=[], predicate=None): 12 11 """ Expose a subset of the file system for searching. """ 12 Source.__init__(self, include, exclude, predicate) 13 13 self.root = os.path.normpath(root) 14 self.include = include15 self.exclude = exclude16 self.predicate = predicate or self._glob_predicate17 14 self.encoding = sys.getfilesystemencoding() 18 15 … … 21 18 path = path.strip(os.path.sep) 22 19 root_path = os.path.join(self.root, path) 23 for file in listdir(root_path):20 for file in os.listdir(root_path): 24 21 full_path = os.path.join(root_path, file) 25 if os.path.isdir(full_path): 22 try: 23 stat = os.lstat(full_path) 24 except OSError: 25 continue 26 if not self.predicate(full_path) or not os.access(full_path, os.R_OK): 27 continue 28 if S_ISDIR(stat.st_mode): 26 29 for file in walk_path(os.path.join(path, file)): 27 30 yield file 28 elif self.predicate(full_path) and os.path.exists(full_path): 29 # TODO Stat for normal files + readability 30 yield self._file2uri(full_path) 31 elif S_ISREG(stat.st_mode): 32 yield (self._file2uri(full_path).decode(self.encoding), stat) 31 33 32 for file in walk_path('/'): 33 yield file.decode(self.encoding) 34 for file, stat in walk_path('/'): 35 self._state[file] = stat.st_mtime 36 yield file 34 37 35 38 def matches(self, uri): 36 scheme, netloc, path, query, fragment = urlsplit(uri )39 scheme, netloc, path, query, fragment = urlsplit(uri, 'file') 37 40 path = os.path.normpath(path) 38 return scheme in ('file', '')and \41 return scheme == 'file' and \ 39 42 path.startswith(self.root) and \ 40 43 self.predicate(path) … … 67 70 68 71 def _uri2file(self, uri): 69 scheme, location, path, query, fragment = urlsplit(uri )70 if scheme not in ('file', ''):72 scheme, location, path, query, fragment = urlsplit(uri, 'file') 73 if scheme not in 'file': 71 74 raise InvalidURI("URI scheme in '%s' not supported by FileSource" 72 75 % scheme) … … 76 79 % uri) 77 80 return path.decode(self.encoding) 78 79 def _glob_predicate(self, file):80 for pattern in self.exclude:81 if fnmatch(file, pattern):82 return False83 for pattern in self.include:84 if fnmatch(file, pattern):85 return True86 return False87 -
pyndexter/trunk/pyndexter/hyperestraier.py
r331 r332 5 5 class HyperestraierIndexer(Indexer): 6 6 capabilities = CAP_READONLY | CAP_CONTENT | CAP_ATTRIBUTES | CAP_ORDERING |\ 7 CAP_HITCOUNT | CAP_LIST | CAP_RELEVANCE 7 CAP_HITCOUNT | CAP_LIST | CAP_RELEVANCE | CAP_WHOLEWORD | \ 8 CAP_ASTERISK | CAP_INTERSECTION 8 9 9 10 def __init__(self, path, source=None, mode=READWRITE, hype_mode=None): 10 11 Indexer.__init__(self, source, mode, os.path.join(path, 'state.db')) 11 12 self.path = path 12 if not os.path.exists(self.path): 13 if mode != READWRITE: 14 raise IndexerError("Index directory has not been initialised") 15 os.makedirs(self.path) 13 self._init_env(self.path) 16 14 self.hype_path = os.path.join(self.path, 'hyperestraier.db') 17 15 if hype_mode is None: … … 60 58 else: 61 59 order_type = 'STR' 62 print order_ascending63 60 order = u'@%s %s%s' % (order_by, order_type, 64 61 order_ascending and 'A' or 'D') -
pyndexter/trunk/pyndexter/metasource.py
r331 r332 5 5 class MetaSource(Source): 6 6 """ A collection of sources. If sources serve the same documents the 7 results are undefined, and probably not good."""7 results will be undefined, and probably not good. """ 8 8 def __init__(self, sources=[]): 9 9 self.sources = sources … … 31 31 if source.matches(uri): 32 32 return source.fetch(uri) 33 raise DocumentNotFound 33 raise DocumentNotFound(uri) 34 34 35 35 def exists(self, uri): … … 49 49 state = pickle.loads(state) 50 50 except Exception, e: 51 raise InvalidState('Invalid state provided to document source. '51 raise InvalidState('Invalid state provided to MetaSource. ' 52 52 'Exception was %s: %s' % (e.__class__.__name__, e)) 53 53 for source in self.sources: … … 58 58 for change in source.difference(state[hash(source)]): 59 59 yield change 60 -
pyndexter/trunk/pyndexter/xapian.py
r329 r332 16 16 17 17 class XapianIndexer(Indexer): 18 capabilities = CAP_ORDERING | CAP_READONLY | CAP_ATTRIBUTES | CAP_RELEVANCE | \ 19 CAP_HITCOUNT | CAP_LIST 18 capabilities = CAP_ORDERING | CAP_READONLY | CAP_ATTRIBUTES | \ 19 CAP_RELEVANCE | CAP_HITCOUNT | CAP_LIST | CAP_WHOLEWORD | \ 20 CAP_INTERSECTION 20 21 21 22 def __init__(self, path, source=None, mode=READWRITE): 22 23 Indexer.__init__(self, source, mode, os.path.join(path, 'state.db')) 23 24 self.path = path 24 if not os.path.exists(self.path): 25 if mode != READWRITE: 26 raise IndexerError("Index directory has not been initialised") 27 os.makedirs(self.path) 25 self._init_env(self.path) 28 26 self.idx_path = os.path.join(path, 'xapian.db') 29 27 if mode == READWRITE: … … 68 66 def search(self, phrase, order_by=None, order_ascending=True, 69 67 order_type=str, intersection=True): 68 phrase = phrase.encode('utf-8') 70 69 if order_by == 'relevance': 71 70 order_args = {'sortByRelevence': True}
