Changeset 357
- Timestamp: 01/04/07 06:52:59 (2 years ago)
- Files:
- pyndexter/branches/refactoring (copied) (copied from pyndexter/trunk)
- pyndexter/branches/refactoring/COPYING (copied) (copied from pyndexter/trunk/COPYING)
- pyndexter/branches/refactoring/pyndexter (copied) (copied from pyndexter/trunk/pyndexter)
- pyndexter/branches/refactoring/pyndexter/default.py (copied) (copied from pyndexter/trunk/pyndexter/default.py)
- pyndexter/branches/refactoring/pyndexter/file.py (copied) (copied from pyndexter/trunk/pyndexter/file.py) (2 diffs)
- pyndexter/branches/refactoring/pyndexter/hyperestraier.py (copied) (copied from pyndexter/trunk/pyndexter/hyperestraier.py) (2 diffs)
- pyndexter/branches/refactoring/pyndexter/indexers.py (copied) (copied from pyndexter/trunk/pyndexter/indexers.py)
- pyndexter/branches/refactoring/pyndexter/__init__.py (copied) (copied from pyndexter/trunk/pyndexter/__init__.py) (15 diffs)
- pyndexter/branches/refactoring/pyndexter/lucene.py (copied) (copied from pyndexter/trunk/pyndexter/lucene.py)
- pyndexter/branches/refactoring/pyndexter/metasource.py (copied) (copied from pyndexter/trunk/pyndexter/metasource.py) (1 diff)
- pyndexter/branches/refactoring/pyndexter/portalocker.py (copied) (copied from pyndexter/trunk/pyndexter/portalocker.py)
- pyndexter/branches/refactoring/pyndexter/sources.py (copied) (copied from pyndexter/trunk/pyndexter/sources.py)
- pyndexter/branches/refactoring/pyndexter/util.py (copied) (copied from pyndexter/trunk/pyndexter/util.py) (1 diff)
- pyndexter/branches/refactoring/pyndexter/xapian.py (copied) (copied from pyndexter/trunk/pyndexter/xapian.py) (2 diffs)
- pyndexter/branches/refactoring/setup.py (copied) (copied from pyndexter/trunk/setup.py) (2 diffs)
- pyndexter/branches/refactoring/.todo (copied) (copied from pyndexter/trunk/.todo) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/branches/refactoring/pyndexter/file.py
r354 r357 64 64 return os.path.exists(self._uri2file(uri)) 65 65 66 def _fetch_content(self, uri):67 path = self._uri2file(uri)68 return codecs.open(path, encoding='utf-8', errors='replace').read()69 70 66 def __hash__(self): 71 67 return hash(self._file2uri(self.root) + '-'.join(self.exclude) + \ … … 73 69 74 70 # Internal methods 71 def _fetch_content(self, uri): 72 path = self._uri2file(uri) 73 return codecs.open(path, encoding='utf-8', errors='replace').read() 74 75 75 def _file2uri(self, file): 76 76 return urlunsplit(('file', '', file, '', '')) pyndexter/branches/refactoring/pyndexter/hyperestraier.py
r354 r357 8 8 9 9 import os 10 import hype10 import HyperEstraier 11 11 from pyndexter import * 12 12 13 13 class HyperestraierIndexer(Indexer): 14 """ Pyndexter adapter for the Hyperestraier indexer. """ 14 15 capabilities = CAP_READONLY | CAP_CONTENT | CAP_ATTRIBUTES | CAP_ORDERING |\ 15 16 CAP_HITCOUNT | CAP_LIST | CAP_RELEVANCE | CAP_WHOLEWORD | \ 16 17 CAP_ASTERISK | CAP_INTERSECTION 17 18 18 def __init__(self, path, source=None, mode=READWRITE, hype_mode=None): 19 Indexer.__init__(self, source, mode, os.path.join(path, 'state.db')) 20 self.path = path 21 self._init_env(self.path) 22 self.hype_path = os.path.join(self.path, 'hyperestraier.db') 23 if hype_mode is None: 24 hype_mode = 0 25 if mode == READONLY: 26 hype_mode |= hype.ESTDBREADER 27 elif mode == READWRITE: 28 hype_mode |= hype.ESTDBWRITER|hype.ESTDBCREAT 29 self.db = hype.Database(self.hype_path, hype_mode) 19 def __init__(self, mode=READWRITE, hype_mode=None): 20 Indexer.__init__(self) 21 self.hype_mode = hype_mode 22 23 def bind(self, framework): 24 Indexer.bind(self, framework) 25 26 self.path = os.path.join(framework.path, 'hyperestraier.db') 27 28 if framework.mode == READWRITE: 29 if not os.path.exists(self.path): 30 os.makedirs(self.path) 31 32 if self.hype_mode is None: 33 self.hype_mode = HyperEstraier.Database.DBREADER 34 if self.framework.mode == READWRITE: 35 self.hype_mode |= HyperEstraier.Database.DBWRITER|HyperEstraier.Database.DBCREAT 36 37 self.db = HyperEstraier.Database() 38 self.db.open(self.path, self.hype_mode) 30 39 31 40 def fetch(self, uri): 32 if self.source: 33 return self.source.fetch(uri) 34 doc = self.db.get_doc_by_uri(uri) 41 id = self.db.uri_to_id(uri) 42 doc = self.db.get_doc(id, 0) 35 43 if doc is None: 36 44 raise DocumentNotFound(uri) 37 45 attributes = self._translate_attributes(doc) 38 return Document(content= doc.text, source=self.source, **attributes)46 return Document(content=''.join(doc.texts()), source=self.framework.source, **attributes) 39 47 40 48 def 
index(self, document): 41 self._assert_rw() 42 if isinstance(document, basestring): 43 document = self.fetch(document) 44 hdoc = hype.Document(document.uri) 49 hdoc = HyperEstraier.Document() 45 50 for k, v in document.attributes.iteritems(): 46 if k != 'uri':47 hdoc['@' + k] = v48 hdoc.add_text(document.content )49 self.db.put_doc(hdoc )51 hdoc.add_attr(unicode('@' + k).encode('utf-8'), 52 unicode(v).encode('utf-8')) 53 hdoc.add_text(document.content.encode('utf-8')) 54 self.db.put_doc(hdoc, 1) 50 55 51 def discard(self, document): 52 self._assert_rw() 53 if isinstance(document, Document): 54 document = document.uri 55 doc = self.db.get_doc_by_uri(document) 56 def discard(self, uri): 57 doc = self.db.get_doc_by_uri(uri) 56 58 if not doc: 57 raise DocumentNotFound( document)58 self.db. remove(doc)59 raise DocumentNotFound(uri) 60 self.db.out_doc(doc, HyperEstraier.Database.ODCLEAN) 59 61 60 def search(self, phrase, flags=0, order_by=None, 61 order_ascending=True, order_type=str): 62 phrase = ((not flags & SEARCH_UNION) and ' ' or '|').join(phrase.split()) 63 order = None 64 if order_by is not None: 65 if order_type is int: 66 order_type = 'NUM' 67 else: 68 order_type = 'STR' 69 order = u'@%s %s%s' % (order_by, order_type, 70 order_ascending and 'A' or 'D') 71 if not flags & SEARCH_ASTERISK: 72 phrase = phrase.replace('*', '\\*') 73 if not flags & SEARCH_QUESTION: 74 phrase = phrase.replace('?', '\\?') 75 if not flags & SEARCH_WHOLEWORD: 76 phrase = '*' + '* *'.join(phrase.split()) + '*' 77 return self.hype_search(phrase, order=order) 62 def search(self, query): 63 raise NotImplementedError 64 # def search(self, phrase, flags=0, order_by=None, 65 # order_ascending=True, order_type=str): 66 # phrase = ((not flags & SEARCH_UNION) and ' ' or '|').join(phrase.split()) 67 # order = None 68 # if order_by is not None: 69 # if order_type is int: 70 # order_type = 'NUM' 71 # else: 72 # order_type = 'STR' 73 # order = u'@%s %s%s' % (order_by, order_type, 74 # order_ascending 
and 'A' or 'D') 75 # if not flags & SEARCH_ASTERISK: 76 # phrase = phrase.replace('*', '\\*') 77 # if not flags & SEARCH_QUESTION: 78 # phrase = phrase.replace('?', '\\?') 79 # if not flags & SEARCH_WHOLEWORD: 80 # phrase = '*' + '* *'.join(phrase.split()) + '*' 81 # return self.hype_search(phrase, order=order) 78 82 83 def optimise(self): 84 self.db.optimize() 85 86 def sync(self): 87 self.db.sync() 88 89 def close(self): 90 self.db.close() 91 self.db = None 92 93 # Hyperestraier-specific methods 79 94 def hype_search(self, phrase, simple=True, order=None): 80 95 """ Full Hyperestraier search phrase. """ … … 84 99 return HyperestraierSearch(self, phrase, search) 85 100 86 def optimize(self):87 self._assert_rw()88 self.db.optimize()89 90 def sync(self):91 if self.mode == READWRITE:92 self.db.sync()93 self._sync_source_state()94 95 def close(self):96 if self.mode == READWRITE:97 self.sync()98 self.db = None99 100 101 # Internal methods 101 102 def _translate_attributes(self, hdoc): 102 103 attributes = {} 103 for k in hdoc.attr ibutes:104 for k in hdoc.attr_names(): 104 105 if k[0] == '@': 105 attributes[k[1:]] = hdoc. get(k)106 attributes[k[1:]] = hdoc.attr(k).decode('utf-8') 106 107 else: 107 attributes[k] = hdoc. get(k)108 attributes[k] = hdoc.attr(k).decode('utf-8') 108 109 return attributes 109 110 pyndexter/branches/refactoring/pyndexter/__init__.py
r354 r357 7 7 # 8 8 9 import re 9 10 import os 10 11 import pickle … … 25 26 IndexerError 26 27 SourceError 28 InvalidQuery 27 29 28 30 REMOVED ADDED MODIFIED … … 36 38 SEARCH_WHOLEWORD SEARCH_ASTERISK SEARCH_QUESTION SEARCH_UNION 37 39 38 Document Source Indexer Search Hit40 Query Framework Document Source Indexer Search Hit 39 41 """.split() 40 42 … … 88 90 """ The mode (READONLY or READWRITE) of the indexer is an 89 91 invalid state for a particular operation. """ 92 class InvalidQuery(Error): 93 """ Invalid query string. """ 90 94 91 95 … … 146 150 be able to determine what has changed. For FileSource this is a list of all 147 151 files and their modification times, for a SubversionSource it would be as 148 simple as the changeset number. The default state() and difference() 149 methods use the data in self._state. 152 simple as the changeset number. By default, `marshal()` and 153 `difference()` assume that `_state` will contain a dictionary of 154 uri:modification-time mappings. 150 155 151 156 (All attributes, including document contents and URI's must be in unicode) … … 153 158 154 159 def __init__(self, include=None, exclude=None, predicate=None): 155 self.include = include or ['*'] 156 self.exclude = exclude or [] 160 if include is None: 161 include = ['*'] 162 if exclude is None: 163 exclude = [] 164 self.include = include 165 self.exclude = exclude 157 166 self.predicate = predicate or self._glob_predicate 158 167 self._state = {} … … 164 173 165 174 def __hash__(self): 166 """ The hash must uniquely identify the source. (This 167 method is primarily used by the MetaSource class) """ 168 raise NotImplementedError('The hash of a Source is required by the ' 169 'MetaSource class.') 175 """ The hash must uniquely identify the source. (This method is 176 primarily used by the MetaSource class) """ 177 raise NotImplementedError 170 178 171 179 def __iter__(self): … … 179 187 DocumentNotFound if unable to fetch the document. 
""" 180 188 raise NotImplementedError 189 190 def bind(self, framework): 191 """ Bind the `Source` to the given framework. """ 181 192 182 193 def exists(self, uri): … … 188 199 return False 189 200 190 def state(self): 191 """ Return a raw byte string representing the current state of this 192 source. Storage and retrieval of this byte string is typically handled 193 by the Indexer. If this method returns false, the Indexer will assume 194 that state information is not available, and do nothing. """ 195 if not self._state: 196 return None 197 return self._marshal_state(self._state) 198 199 def difference(self, state): 201 def marshal(self, file): 202 """ Store the state of the `Source` to `file`. Used during an 203 `update()`. """ 204 state = pickle.dumps(self._state, 2) 205 gzip.GzipFile(filename='pyndexter source state', fileobj=file, 206 mode='wb', compresslevel=1).write(state) 207 208 def difference(self, file): 200 209 """ Return an iterable of tuples representing the differences between 201 the current state of the sourceand that in the provided state. Each210 the current state of the `Source` and that in the provided state. Each 202 211 tuple is in the form `(<transition>, uri)`, where <transition> is one 203 of ADDED, REMOVED or MODIFIED. """212 of `ADDED`, `REMOVED` or `MODIFIED`. """ 204 213 current = set() 205 state = self._unmarshal_state(state) 214 try: 215 ungzipped = gzip.GzipFile(fileobj=file, mode='rb').read() 216 state = pickle.loads(ungzipped) 217 except Exception, e: 218 raise InvalidState('Invalid state provided to document source. ' 219 'Exception was %s: %s' % (e.__class__.__name__, e)) 206 220 for uri in self: 207 221 current.add(uri) … … 226 240 return False 227 241 228 def _marshal_state(self, state): 229 """ Pickle and compress state. This is used by the default state() 230 implementation, but can be reused. 
""" 231 state = pickle.dumps(state, 2) 232 compressed = StringIO() 233 gzip.GzipFile(filename='pyndexer source state', fileobj=compressed, 234 mode='wb', compresslevel=1).write(state) 235 return compressed.getvalue() 236 237 def _unmarshal_state(self, state): 238 """ Uncompress and unpickle state. Used by the default difference() 239 method, but can be reused. """ 240 state = StringIO(state) 241 try: 242 ungzipped = gzip.GzipFile(fileobj=state, mode='rb').read() 243 return pickle.loads(ungzipped) 244 except Exception, e: 245 raise InvalidState('Invalid state provided to document source. ' 246 'Exception was %s: %s' % (e.__class__.__name__, e)) 242 243 class QueryNode(object): 244 """ A query parse node. """ 245 246 TERM = 0 247 NOT = 1 248 AND = 2 249 OR = 3 250 251 __slots__ = ('type', 'value', 'left', 'right') 252 253 def __init__(self, type, value=None, left=None, right=None): 254 self.type = type 255 self.value = value 256 self.left = left 257 self.right = right 258 259 def __repr__(self): 260 if self.type is None: 261 return '' 262 type_map = ('term', 'not', 'and', 'or') 263 def show(node, depth=0): 264 if node.type == QueryNode.TERM: 265 text = '%s("%s"' % (' ' * depth, node.value) 266 else: 267 text = "%s(%s%s" % (' ' * depth, type_map[node.type], node.value and ' "%s"' % node.value or "") 268 if node.left or node.right: 269 text += "\n" 270 if node.left: 271 text += show(node.left, depth + 1) 272 else: 273 text += "%snil" % (' ' * (depth + 1)) 274 text += "\n" 275 if node.right: 276 text += show(node.right, depth + 1) 277 else: 278 text += "%snil" % (' ' * (depth + 1)) 279 text += ")" 280 return text 281 return show(self) 282 283 284 class Query(QueryNode): 285 """ Query parser. Converts a simple query language into a parse tree which 286 Indexers can then convert into their own implementation-specific 287 representation. 
288 289 The query language is in the following form: 290 291 <term> <term> document must contain all of these terms 292 "some term" return documents matching this exact phrase 293 -<term> exclude documents containing this term 294 <term> or <term> return documents matching either term 295 296 eg. 297 298 >>> Query('lettuce tomato -cheese') 299 (and 300 ("lettuce") 301 (and 302 ("tomato") 303 (not 304 ("cheese") 305 nil))) 306 307 >>> Query('"mint slices" -timtams') 308 (and 309 ("mint slices") 310 (not 311 ("timtams") 312 nil)) 313 314 >>> Query('brie cheese or camembert cheese') 315 (and 316 ("brie") 317 (or 318 ("cheese") 319 (and 320 ("camembert") 321 ("cheese")))) 322 """ 323 324 _tokenise = re.compile(r"(?P<ex>-)|(?P<or>or)|\"(?P<dq>(?:\\.|[^\"])*)\"|'(?P<sq>(?:\\.|[^'])*)'|(?P<te>(?:\S)+)", re.I) 325 _group_map = {'dq': QueryNode.TERM, 'sq': QueryNode.TERM, 'te': QueryNode.TERM, 326 'ex': QueryNode.NOT, 'or': QueryNode.OR} 327 328 def __init__(self, query): 329 QueryNode.__init__(self, None) 330 tokens = [(self._group_map[token.lastgroup], token.group(token.lastindex)) 331 for token in self._tokenise.finditer(query)] 332 root = self.parse(tokens) 333 if root: 334 for k in self.__slots__: 335 setattr(self, k, getattr(root, k)) 336 337 def parse(self, tokens): 338 # TODO: add support for sub-expressions eg. 
"(a b) or c" 339 left = self.parse_unary(tokens) 340 if tokens: 341 if tokens[0][0] == QueryNode.OR: 342 tokens.pop(0) 343 return QueryNode(QueryNode.OR, left=left, right=self.parse(tokens)) 344 else: 345 return QueryNode(QueryNode.AND, left=left, right=self.parse(tokens)) 346 return left 347 348 def parse_unary(self, tokens): 349 if not tokens: 350 return None 351 if tokens[0][0] == QueryNode.NOT: 352 tokens.pop(0) 353 return QueryNode(QueryNode.NOT, left=self.parse_terminal(tokens)) 354 return self.parse_terminal(tokens) 355 356 def parse_terminal(self, tokens): 357 if not tokens: 358 raise InvalidQuery('Unexpected end of string') 359 if tokens[0][0] in (QueryNode.TERM, QueryNode.OR): 360 token = tokens.pop(0) 361 return QueryNode(QueryNode.TERM, value=token[1]) 362 raise InvalidQuery('Expected terminal, got "%s"' % tokens[0][1]) 363 247 364 248 365 class Indexer(object): 366 """ An Indexer performs document indexing and searching. This base object 367 provides a framework for indexers. """ 368 249 369 capabilities = 0 250 370 251 """ An Indexer performs indexing and searching on a document Source. 371 def __init__(self): 372 """ Initialise indexer. """ 373 self.framework = None 374 375 def close(self): 376 """ Close the indexer. The object is subsequently not usable. 377 378 `sync()` is automatically called by the `Framework` prior to `close()`.""" 379 raise NotImplementedError 380 381 def index(self, document): 382 """ Index a single Document object. """ 383 raise NotImplementedError 384 385 def discard(self, uri): 386 """ Discard a document. """ 387 raise NotImplementedError 388 389 def search(self, query): 390 """ Search with the given Query. """ 391 raise NotImplementedError 392 393 def __iter__(self): 394 """ Iterate over all documents in the index. """ 395 raise NotImplementedError 396 397 # Optional methods 398 def bind(self, framework): 399 """ Bind the `Indexer` to the given framework. 
""" 400 self.framework = framework 401 402 def optimise(self): 403 """ Optimise the indexer. """ 404 405 def sync(self): 406 """ Synchronise indexer with stored representation. """ 252 407 253 `source` is the Source object, if any. 254 `state_path` is the location to store `source` state data. If this is 255 provided, update() and sync() will automatically store and retrieve source 256 state. """ 257 def __init__(self, source=None, mode=READWRITE, state_path=None): 408 def fetch(self, uri): 409 """ Fetch a Document. Note that, depending on the Indexer, the returned 410 content may not be identical to the originall indexed document. """ 411 412 413 class Framework(object): 414 """ The glue. Ties `Indexer` and `Source` together and provides a 415 convenient interface. """ 416 def __init__(self, path, indexer, source=None, mode=READWRITE): 417 self.path = path 418 self.state_path = os.path.join(self.path, 'state.db') 419 self.indexer = indexer 258 420 self.source = source 259 421 self.mode = mode 260 self.state_path = state_path 422 423 if not os.path.exists(self.path): 424 os.makedirs(self.path) 425 426 self.indexer.bind(self) 427 if self.source: 428 self.source.bind(self) 261 429 262 430 def fetch(self, uri): 263 """ Fetch a document. Try to use the indexers data, but fall back264 on the Source copy, if available. """431 """ Fetch a document. Prefer to fetch from `Source` object if 432 available, otherwise fall back to `Indexer`. """ 265 433 if not self.source: 266 raise IndexerError("This indexer has no associated Source object " 267 "and as such can not fetch() documents.") 434 return self.indexer.fetch(uri) 268 435 return self.source.fetch(uri) 269 270 def __iter__(self):271 """ Iterate over all URI's in the index. 
"""272 raise NotImplementedError273 436 274 437 def update(self): … … 281 444 "capable of automatic updates.") 282 445 if os.path.exists(self.state_path): 283 try: 284 state = open(self.state_path).read() 285 except Exception, e: 286 raise IndexerError("Source state '%s' is not readable. " 287 "Exception was %s: %s" % 288 (self.state_path, e.__class__.__name__, 289 unicode(e))) 290 446 state = open(self.state_path) 291 447 for transition, uri in self.source.difference(state): 292 448 if transition == REMOVED: … … 296 452 else: 297 453 for uri in self.source: 454 print uri 298 455 self.index(uri) 299 456 … … 301 458 """ Index a single document, specified as either a Document object 302 459 or a URI. """ 303 raise NotImplementedError 460 self._assert_rw() 461 if isinstance(document, basestring): 462 document = self.fetch(document) 463 return self.indexer.index(document) 464 304 465 305 466 def discard(self, document): 306 467 """ Discard the specified document from the index, specified as either 307 468 a Document object or a URI. """ 308 raise NotImplementedError 309 310 def search(self, phrase, flags=0, order_by=None, 311 order_ascending=True, order_type=str): 312 """ Search the index for documents containing the given terms. If 313 intersection is True, return only documents that match all terms. This 469 self._assert_rw() 470 if isinstance(document, Document): 471 document = document.uri 472 return self.indexer.discard(document) 473 474 475 def search(self, query): 476 """ Search the index for documents matching the given query. This 314 477 method is guaranteed to work across all indexers. 478 479 `query` is a pyndexter compatible search string. 315 480 316 `order` is an optional attribute by which to order results. If prefixed 317 by a `<`, results will be in descending order, `>` for ascending. 318 319 `order_type` is typically either `str` or `int`. 320 321 `flags` is a bitwise or of the `SEARCH_*` flags 322 323 Returns a Search object. 
""" 324 raise NotImplementedError 481 Returns a `Search` object. """ 482 query = Query(query) 483 return self.indexer.search(query) 325 484 326 485 def close(self): 327 486 """ Sync and close the indexer. The object is subsequently not 328 487 usable. """ 329 raise NotImplementedError330 331 # Default NOP methods 332 def optimi ze(self):488 self.indexer.sync() 489 self.indexer.close() 490 491 def optimise(self): 333 492 """ Optimise the indexer. """ 493 self.indexer.optimise() 334 494 335 495 def sync(self): 336 496 """ Synchronise indexer with on-disk representation. """ 497 if self.mode == READWRITE: 498 self._sync_source_state() 499 self.indexer.sync() 337 500 338 501 # Helper methods … … 346 509 constructor. """ 347 510 if self.mode == READWRITE and self.source and self.state_path: 348 state = self.source.state() 349 if state: 350 open(self.state_path, 'w').write(self.source.state()) 351 352 def _init_env(self, path): 353 """ Create a default environment with a <path> base directory. """ 354 if not os.path.exists(path): 355 if self.mode != READWRITE: 356 raise IndexError("Indexer environment has not been initialised") 357 os.makedirs(path) 511 file = open(self.state_path, 'wb') 512 self.source.marshal(file) 513 file.close() 514 358 515 359 516 class Search(object): … … 409 566 return self._document 410 567 document = property(_get_document) 568 569 570 if __name__ == '__main__': 571 import doctest 572 doctest.testmod() pyndexter/branches/refactoring/pyndexter/metasource.py
r354 r357 47 47 return False 48 48 49 def state(self):49 def marshal(self, file): 50 50 state = {} 51 51 for source in self.sources: 52 state[hash(source)] = source.state() 53 return pickle.dumps(state, 2) 52 stream = StringIO() 53 source.marshal(stream) 54 state[hash(source)] = stream.getvalue() 55 file.write(pickle.dumps(state, 2)) 54 56 55 57 def difference(self, state): pyndexter/branches/refactoring/pyndexter/util.py
r354 r357 54 54 key = self.accessbytime[age] 55 55 del self[key] 56 57 58 def uriparse(uri): 59 """ Parse a URI into its component parts. The query is passed through 60 `cgi.parse_qs()`. (scheme://netloc/path;parameters?query#fragment). PS. 61 `urlparse` is not useful. """ 62 from cgi import parse_qs 63 import re 64 65 global urisplit 66 if not hasattr(urisplit, '_pattern'): 67 urisplit._pattern = re.compile(r'(?P<scheme>[^:]+)://(?P<netloc>[^/]*)(?P<path>/[^#?]*)?(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?') 68 69 match = urisplit._pattern.match(uri) 70 if match is None: 71 raise ValueError('Invalid URI') 72 groups = match.groups() 73 return groups[0:3] + (parse_qs(groups[3] or ''),) + groups[4:] pyndexter/branches/refactoring/pyndexter/xapian.py
r354 r357 18 18 CAP_INTERSECTION 19 19 20 def __init__(self, path, source=None, mode=READWRITE, stemmer='english', 21 words=r'\w+'): 22 Indexer.__init__(self, source, mode, os.path.join(path, 'state.db')) 23 self.path = path 24 self._init_env(self.path) 25 self.idx_path = os.path.join(path, 'xapian.db') 26 if mode == READWRITE: 27 self.db = xapian.WritableDatabase(self.idx_path, 28 xapian.DB_CREATE_OR_OPEN) 29 else: 30 self.db = xapian.Database(self.idx_path) 20 def __init__(self, stemmer='english', words=r'\w+'): 21 Indexer.__init__(self) 31 22 self.stemmer = xapian.Stem('english') 32 23 self.words = re.compile(words) 33 24 25 def bind(self, framework): 26 Indexer.bind(self, framework) 27 self.path = os.path.join(framework.path, 'xapian.db') 28 if self.framework.mode == READWRITE: 29 if not os.path.exists(self.path): 30 os.makedirs(self.path) 31 self.db = xapian.flint_open(self.path, xapian.DB_CREATE_OR_OPEN) 32 else: 33 self.db = xapian.flint_open(self.path) 34 34 35 35 def index(self, document): 36 self._assert_rw()37 if isinstance(document, basestring):38 document = self.fetch(document)39 40 36 doc = xapian.Document() 41 37 … … 54 50 self.db.replace_document('Q' + uri, doc) 55 51 56 def discard(self, document): 57 self._assert_rw() 58 if isinstance(document, Document): 59 document = document.uri 60 self.db.delete_document('Q' + document.encode('utf-8')) 52 def discard(self, uri): 53 self.db.delete_document('Q' + uri.encode('utf-8')) 61 54 62 55 def sync(self): 63 if self.mode == READWRITE: 64 self._assert_rw() 65 self.db.flush() 66 self._sync_source_state() 56 self.db.flush() 67 57 68 58 def close(self): 69 59 self.sync() 60 self.db.close() 70 61 self.db = None 71 62 pyndexter/branches/refactoring/setup.py
r354 r357 14 14 author='Alec Thomas', 15 15 author_email='alec@swapoff.org', 16 version='0. 1',16 version='0.2', 17 17 classifiers=['Development Status :: 3 - Alpha', 18 18 'Environment :: Plugins', … … 21 21 'Operating System :: OS Independent', 22 22 'Topic :: Software Development :: Libraries'], 23 extras_require={'hype': ['hype>=0.1'], 24 'Xapwrap': ['Xapwrap>=0.3']}, 23 extras_require={'hype': ['hype>=0.1']}, 25 24 ext_modules=[Extension('pyndexter.pyrex', ['pyndexter/pyrex.pyx'])], 26 25 packages=['pyndexter']) pyndexter/branches/refactoring/.todo
r354 r357 27 27 For storing state, perhaps there should be default store_state(store)/restore_state(store) methods. Also need a Store class, or just use a file object... 28 28 </note> 29 <note priority="high" time="1159197046"> 30 Refactor Indexer into two classes: the Indexer itself, and a class that glues Source and the Indexer together. This would remove the duplication I'm getting in all the stock methods (update, index, fetch, etc.) 31 </note> 29 32 </todo>
