Changeset 364
- Timestamp:
- 01/15/07 09:53:36 (2 years ago)
- Files:
-
- pyndexter/branches/refactoring/pyndexter/indexers/default.py (modified) (1 diff)
- pyndexter/branches/refactoring/pyndexter/indexers/hype.py (added)
- pyndexter/branches/refactoring/pyndexter/indexers/hyperestraier.py (modified) (5 diffs)
- pyndexter/branches/refactoring/pyndexter/indexers/__init__.py (modified) (1 diff)
- pyndexter/branches/refactoring/pyndexter/indexers/lupy.py (modified) (2 diffs)
- pyndexter/branches/refactoring/pyndexter/indexers/xapian.py (modified) (5 diffs)
- pyndexter/branches/refactoring/pyndexter/__init__.py (modified) (12 diffs)
- pyndexter/branches/refactoring/pyndexter/sources/file.py (modified) (5 diffs)
- pyndexter/branches/refactoring/pyndexter/sources/metasource.py (modified) (3 diffs)
- pyndexter/branches/refactoring/pyndexter/util.py (modified) (3 diffs)
- pyndexter/branches/refactoring/.todo (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/branches/refactoring/pyndexter/indexers/default.py
r362 r364 96 96 """ Default indexer, using bigrams. """ 97 97 98 capabilities = CAP_READONLY | CAP_HITCOUNT | CAP_UNION | \99 CAP_INTERSECTION | CAP_ITERATION | CAP_LIST | \100 CAP_ATTRIBUTES | CAP_WHOLEWORD101 102 98 _tokeniser = re.compile(r'\w+') 103 99 pyndexter/branches/refactoring/pyndexter/indexers/hyperestraier.py
r361 r364 7 7 # 8 8 9 """ 10 Adapter for Hyperestraier using the swigged bindings 11 (http://hyperestraier.sourceforge.net/) 12 """ 13 9 14 import os 10 15 import HyperEstraier … … 12 17 13 18 14 __all__ = ['create_indexer', 'HyperestraierIndexer', 'HyperestraierSearch'] 15 16 17 def indexer(framework, **args): 18 """ Create HyperEstraier Pyndexter adapter. """ 19 from pyndexter.util import cast_args 20 args = cast_args(args, {'hype_mode': int}) 21 return HyperEstraierIndexer(framework, **args) 19 __all__ = ['HyperestraierIndexer', 'HyperestraierResult'] 22 20 23 21 24 22 class HyperestraierIndexer(Indexer): 25 23 """ Pyndexter adapter for the Hyperestraier indexer. """ 26 capabilities = CAP_READONLY | CAP_CONTENT | CAP_ATTRIBUTES | CAP_ORDERING |\ 27 CAP_HITCOUNT | CAP_LIST | CAP_RELEVANCE | CAP_WHOLEWORD | \ 28 CAP_ASTERISK | CAP_INTERSECTION 29 30 def __init__(self, framework, hype_mode=None, **ignore): 24 def __init__(self, framework, path, hype_mode=None, **ignore): 31 25 Indexer.__init__(self, framework) 32 26 self.hype_mode = hype_mode 33 27 34 self.path = os.path.join(framework.path, 'hyperestraier.db').encode('utf-8') 28 self.path = path 29 self.db_path = os.path.join(path, 'hyperestraier.db').encode('utf-8') 30 self.state_path = os.path.join(path, 'state.db') 35 31 36 32 if framework.mode == READWRITE: … … 69 65 raise DocumentNotFound(uri) 70 66 self.db.out_doc(id, HyperEstraier.Database.ODCLEAN) 67 68 def state_store(self): 69 return StateStore(self.state_path) 71 70 72 71 def search(self, query): … … 110 109 # if order is not None: 111 110 # search = search.order(order) 112 return Hyperestraier Search(self, phrase, search)111 return HyperestraierResult(self, phrase, search) 113 112 114 113 # Internal methods … … 139 138 140 139 141 class HyperestraierSearch(Search): 140 indexer_factory = ComponentFactory(HyperestraierIndexer, hype_mode=int) 141 142 143 class HyperestraierResult(Result): 142 144 def __iter__(self): 143 145 for id in self.context: pyndexter/branches/refactoring/pyndexter/indexers/__init__.py
r361 r364 7 7 # 8 8 9 __all__ = ['hyperestraier', ' xapian', 'lucene', 'lupy', 'default', 'pyrex']9 __all__ = ['hyperestraier', 'hype', 'xapian', 'lucene', 'lupy', 'default', 'pyrex'] pyndexter/branches/refactoring/pyndexter/indexers/lupy.py
r362 r364 1 # -*- coding: utf-8 -*- 2 # 3 # Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 4 # 5 # This software is licensed as described in the file COPYING, which 6 # you should have received as part of this distribution. 7 # 8 9 """ 10 Adapter for the (deprecated, but still available from 11 http://gentoo.prz.rzeszow.pl/distfiles/Lupy-0.2.1.tar.gz) Lupy indexer. 12 """ 13 1 14 import os 2 15 from pyndexter import * 3 16 lupy = __import__('lupy', {}, {}, ['']) 4 17 lupy.indexer = __import__('lupy.indexer', {}, {}, ['']) 18 lupy.search = __import__('lupy.search', {}, {}, ['']) 5 19 6 20 class LupyIndexer(Indexer): 7 """ Adapter for the (deprecated, but still available from 8 http://gentoo.prz.rzeszow.pl/distfiles/Lupy-0.2.1.tar.gz) Lupy indexer. """ 9 10 def bind(self, framework): 11 self.path = os.path.join(framework.path, 'lupy.db').encode('utf-8') 12 13 self.db = lupy.indexer.Index(self.path, create=framework.mode == READWRITE and not os.path.exists(self.path)) 21 def __init__(self, framework, path): 22 Indexer.__init__(self, framework) 23 self.path = path 24 self.db_path = os.path.join(self.path, 'lupy.db').encode('utf-8') 25 self.state_path = os.path.join(self.path, 'state.db') 26 if framework.mode == READWRITE and not os.path.exists(self.path): 27 os.makedirs(self.path) 28 self.db = lupy.indexer.Index(self.db_path, 29 create=framework.mode == \ 30 READWRITE and not os.path.exists(self.db_path)) 14 31 15 32 … … 25 42 26 43 def search(self, query): 27 ands, ors, nots = [], [], [] 28 self._compile_query(query, ands, ors, nots) 44 lupy_query = lupy.indexer.BooleanQuery() 45 self._compile_query(query, (True, False), lupy_query) 46 searcher = lupy.search.indexsearcher.IndexSearcher(self.db_path) 47 hits = searcher.search(lupy_query) 48 return LupyResult(self, query, hits) 29 49 30 50 def optimise(self): 31 51 self.db.optimize() 32 52 53 def close(self): 54 self.db.close() 55 56 def state_store(self): 57 return StateStore(self.state_path) 58 33 59 # Internal methods 34 def _compile_query(self, node, ands, ors, nots):60 def _compile_query(self, node, op, query): 35 61 if not node or node.type == node.NULL: 36 return ''62 return 37 63 if node.type == node.AND: 38 self._compile_query(node.left) 64 self._compile_query(node.left, (True, False), query) 65 self._compile_query(node.right, (True, False), query) 66 elif node.type == node.OR: 67 self._compile_query(node.left, (False, False), query) 68 self._compile_query(node.right, (False, False), query) 69 elif node.type == node.NOT: 70 self._compile_query(node.left, (False, True), query) 71 elif node.type == node.TERM: 72 query.add(lupy.indexer.TermQuery(lupy.indexer.Term('text', node.value)), *op) 73 else: 74 raise NotImplementedError 39 75 40 76 41 class LupySearch(Search): 77 indexer_factory = ComponentFactory(LupyIndexer) 78 79 class LupyResult(Result): 42 80 def __iter__(self): 43 pass 81 for hit in self.context: 82 fields = dict([(str(k), hit.get(k)) for k in hit.fieldNames]) 83 yield Hit(document=self.indexer.framework.fetch, 84 **fields) 85 86 pyndexter/branches/refactoring/pyndexter/indexers/xapian.py
r363 r364 7 7 # 8 8 9 """ 10 Adapter for Xapian (http://www.xapian.org) 11 """ 12 9 13 import os 10 14 import re … … 12 16 xapian = __import__('xapian') 13 17 14 def create_indexer(framework, path=None, stemmer='english', words=r'\w+', 15 **ignore): 16 return XapianIndexer(framework, path=path, stemmer=stemmer, words=words) 18 19 __all__ = ['XapianIndexer', 'XapianResult'] 20 17 21 18 22 class XapianIndexer(Indexer): … … 70 74 enquire = xapian.Enquire(self.db) 71 75 enquire.set_query(query) 72 return Xapian Search(self, query, enquire)76 return XapianResult(self, query, enquire) 73 77 74 78 def state_store(self): … … 80 84 if node.type == node.AND: 81 85 return '%s AND %s' % (self._compile_query(node.left), 82 self._compile_query(node.right))86 self._compile_query(node.right)) 83 87 elif node.type == node.OR: 84 88 return '%s OR %s' % (self._compile_query(node.left), … … 92 96 93 97 94 class XapianSearch(Search): 98 indexer_factory = ComponentFactory(XapianIndexer) 99 100 101 class XapianResult(Result): 95 102 def __iter__(self): 96 matches = self.context.get_mset(0, 10)103 matches = self.context.get_mset(0, 20) 97 104 for hit in matches: 98 105 doc = hit[xapian.MSET_DOCUMENT] pyndexter/branches/refactoring/pyndexter/__init__.py
r362 r364 11 11 import pickle 12 12 import gzip 13 import inspect 13 14 from StringIO import StringIO 14 15 from urlparse import urlsplit, urlunsplit … … 32 33 READONLY READWRITE 33 34 34 CAP_READONLY CAP_ORDERING CAP_CONTENT CAP_ATTRIBUTES CAP_RELEVANCE CAP_HITCOUNT 35 CAP_LIST CAP_ITERATION CAP_ASTERISK CAP_QUESTION CAP_WHOLEWORD CAP_UNION 36 CAP_INTERSECTION 37 38 SEARCH_WHOLEWORD SEARCH_ASTERISK SEARCH_QUESTION SEARCH_UNION 39 40 Query Framework Document Source Indexer Search Hit 35 Query Framework Document Source Indexer Result Hit StateStore ComponentFactory 41 36 """.split() 42 37 … … 50 45 READWRITE = 1 51 46 52 # Indexer capabilities53 CAP_READONLY = 1 # Supports read-only access to the index54 CAP_ORDERING = 2 # Supports result ordering55 CAP_CONTENT = 4 # Can fetch() document content56 CAP_ATTRIBUTES = 8 # Supports per-document attributes57 CAP_RELEVANCE = 16 # Can return results by relevance58 CAP_HITCOUNT = 32 # Search result supports len()59 CAP_LIST = 64 # Search result supports list-style lookup60 CAP_ITERATION = 128 # Supports index iteration61 CAP_ASTERISK = 256 # Supports the asterisk wildcard (*<term>*)62 CAP_QUESTION = 512 # Supports the single character wildcard (a?c)63 CAP_WHOLEWORD = 512 # Performs whole word searches by default64 CAP_UNION = 1024 # Supports unions (ie. matches documents with any word)65 CAP_INTERSECTION = 2048 # Supports intersections (ie. matches documents with66 # all words)67 68 # Search flags. Flags may be ignored if the indexer does not support a69 # particular feature.70 SEARCH_WHOLEWORD = 1 # Perform a wholeword search71 SEARCH_ASTERISK = 2 # Allow wildcard (*) in search term72 SEARCH_QUESTION = 4 # Allow single character wildcard (?) in search term73 SEARCH_UNION = 8 # Whether to perform a union rather than an74 # intersection (the default) of search term results75 47 76 48 class Error(Exception): … … 361 333 362 334 335 class StateStore(object): 336 """A class providing file-like objects for storage and retrieval of 337 framework state.""" 338 339 def __init__(self, path): 340 self.path = path 341 342 def store(self): 343 """Return a file-like object for storing state.""" 344 return open(self.path, 'wb') 345 346 def retrieve(self): 347 """Return a file-like object for fetching state.""" 348 return open(self.path, 'rb') 349 350 def exists(self): 351 """Does the state store exist?""" 352 return os.path.exists(self.path) 353 354 363 355 class Indexer(object): 364 """ An Indexer performs document indexing and searching. This base object 365 provides a framework for indexers. """ 366 367 capabilities = 0 356 """An Indexer performs document indexing and searching. This base object 357 provides a framework for indexers.""" 368 358 369 359 def __init__(self, framework): … … 395 385 raise NotImplementedError 396 386 387 def update(self, document): 388 """Update a document in the index. Default is to `discard()` and 389 `index()`.""" 390 self.discard(document.uri) 391 self.index(document) 392 397 393 def optimise(self): 398 394 """ Optimise the indexer. """ … … 401 397 """ Synchronise indexer with stored representation. """ 402 398 399 def state_store(self): 400 """If this Indexer is capable of storing framework state, return a 401 `StateStore` object.""" 402 return None 403 404 405 class ComponentFactory(object): 406 """Factory for translating URL-style query parameters into a standard 407 constructor call.""" 408 409 class List(object): 410 """Translate a parameter that is a list of elements of `type`.""" 411 def __init__(self, type): 412 self.type = type 413 414 def __call__(self, value): 415 return [self.type(v) for v in value] 416 417 def __init__(self, indexer, **arg_types): 418 """Create a new factory. 419 420 arg_types is a dictionary of <arg>:<type> mappings.""" 421 422 self.indexer = indexer 423 self.arg_types = arg_types 424 args, varargs, self.varkw, defaults = \ 425 inspect.getargspec(self.indexer.__init__) 426 defaults = defaults or [] 427 self.defaults = dict(zip(list(args[-len(defaults):]), defaults)) 428 self.defaults.pop('self', None) 429 self.args = defaults and args[:-len(defaults)] or args 430 431 def __call__(self, framework, **kwargs): 432 args = dict(self.defaults.items()) 433 args['framework'] = framework 434 args.update(kwargs) 435 436 # Translate all remaining arguments 437 for k, v in args.items(): 438 if v is not None and k in self.arg_types: 439 type = self.arg_types[k] 440 # If it's a list, and not marked as such, convert it to a scalar 441 if isinstance(v, (tuple, list)) and not isinstance(type, self.List): 442 if len(v) != 1: 443 raise ValueError('argument "%s" should be a scalar' % k) 444 v = v[0] 445 args[k] = type(v) 446 447 return self.indexer(**args) 448 403 449 404 450 class Framework(object): 405 """ The glue. Ties `Indexer` and `Source` together, performs housekeeping 406 tasks and provides a convenient interface to it all. """ 407 def __init__(self, path, indexer, sources=[], mode=READWRITE, 408 indexer_args={}): 409 self.path = path 410 self.state_path = os.path.join(self.path, 'state.db') 451 """The glue. Ties `Indexer` and `Source` together, performs housekeeping 452 tasks and provides a convenient interface to it all. 453 454 If the `Indexer` is not capable of storing state and automatic updates are 455 desired, a `StateStore` object should be passed to the `Framework`.""" 456 457 def __init__(self, indexer, sources=[], mode=READWRITE, 458 indexer_args={}, state_store=None): 411 459 self.mode = mode 412 460 413 if not os.path.exists(self.path):414 os.makedirs(self.path)415 416 461 self.indexer = self._load_plugin('indexer', indexer, indexer_args) 417 462 463 if state_store is None: 464 self.state_store = self.indexer.state_store() 465 else: 466 self.state_store = state_store 467 418 468 sources = [self._load_plugin('source', source) for source in sources] 469 419 470 from pyndexter.sources.metasource import MetaSource 420 471 self.source = MetaSource(self) … … 423 474 424 475 def add_source(self, source, source_args={}): 425 """ Add a source to be indexed to the framework. """ 476 """ Add a source to be indexed to the framework. Can either be a 477 `Source` instance or a URI.""" 426 478 if isinstance(source, basestring): 427 479 source = self._load_plugin('source', source, source_args) … … 430 482 def fetch(self, uri): 431 483 """ Fetch a document. """ 432 if not self.source:433 raise SourceError("Can't fetch documents without a document source")434 484 return self.source.fetch(uri) 435 485 436 486 def __iter__(self): 437 487 """ Iterate over all URI's in the document source. """ 438 if not self.source:439 raise SourceError("Can't iterate over URI's without a document source")440 488 for uri in self.source: 441 489 yield uri … … 444 492 """ Update the index with the current state of the document source. """ 445 493 self._assert_rw() 446 if not self.source: 447 raise IndexerError("Can't perform automatic update without a Source.") 448 if not self.state_path: 449 raise IndexerError("Source state path not set, Indexer is not " 450 "capable of automatic updates.") 451 if os.path.exists(self.state_path): 452 state = open(self.state_path) 453 for transition, uri in self.source.difference(state): 494 if not self.state_store: 495 raise IndexerError("Source state storage path not defined, " 496 "Framework is not capable of automatic updates.") 497 if self.state_store.exists(): 498 store = self.state_store.retrieve() 499 for transition, uri in self.source.difference(store): 454 500 if transition == REMOVED: 455 501 self.discard(uri) … … 501 547 """ Synchronise indexer with on-disk representation. """ 502 548 if self.mode == READWRITE: 503 self._sync_source_state() 549 if self.mode == READWRITE and self.state_store: 550 store = self.state_store.store() 551 self.source.marshal(store) 504 552 self.indexer.sync() 505 553 … … 507 555 def _load_plugin(self, type, uri, args={}): 508 556 from pyndexter.util import uri_parse 509 scheme, username, password, netloc, path, parameters, query, fragment = \ 557 # Extract URI components 558 scheme, username, password, netloc, path, query, fragment = \ 510 559 uri_parse(uri) 511 query.update(indexer_args) 512 module = __import__('pyndexter.%ss.%s' % (type, scheme), {}, {}, ['']) 513 return getattr(module, type)(self, username=username, password=password, 514 netloc=netloc, path=path, 515 parameters=parameters, fragment=fragment, 516 **query) 560 username = username or None 561 password = password or None 562 uri_components = {'username': username, 'password': password, 563 'netloc': netloc, 'path': path, 'fragment': fragment} 564 # Discard them if they're empty 565 uri_components = dict([(k, v) for k, v in uri_components.iteritems() if v]) 566 query.update(uri_components) 567 query.update(args) 568 module = __import__('pyndexter.%ss.%s' % (type, scheme), 569 {}, {}, ['']) 570 indexer_factory = getattr(module, type + '_factory') 571 assert isinstance(indexer_factory, ComponentFactory) 572 return indexer_factory(self, **query) 517 573 518 574 def _assert_rw(self): … … 521 577 "operation" % self.__class__.__name__) 522 578 523 def _sync_source_state(self): 524 """ Save Source objects state to the location defined in the 525 constructor. """ 526 if self.mode == READWRITE and self.source and self.state_path: 527 file = open(self.state_path, 'wb') 528 self.source.marshal(file) 529 file.close() 530 531 532 class Search(object): 579 580 class Result(object): 533 581 """ Represents the result of a search. Each hit is returned as a Hit 534 582 object. """ pyndexter/branches/refactoring/pyndexter/sources/file.py
r362 r364 12 12 from stat import * 13 13 from urlparse import urlsplit, urlunsplit 14 from pyndexter import * 14 15 15 from pyndexter import Source, Document, DocumentNotFound16 16 17 17 class FileSource(Source): 18 def __init__(self, framework, root, include=None, exclude=None, predicate=None):18 def __init__(self, framework, path, include=None, exclude=None, predicate=None): 19 19 """ Expose a subset of the file system for searching. """ 20 20 Source.__init__(self, framework, include, exclude, predicate) 21 self. root = os.path.normpath(root)21 self.path = os.path.normpath(path) 22 22 self.encoding = sys.getfilesystemencoding() 23 23 … … 25 25 def walk_path(path): 26 26 path = path.strip(os.path.sep) 27 root_path = os.path.join(self. root, path)27 root_path = os.path.join(self.path, path) 28 28 for file in os.listdir(root_path): 29 29 full_path = os.path.join(root_path, file) … … 47 47 path = os.path.normpath(path) 48 48 return scheme == 'file' and \ 49 path.startswith(self. root) and \49 path.startswith(self.path) and \ 50 50 self.predicate(path) 51 51 … … 65 65 66 66 def __hash__(self): 67 return hash(self._file2uri(self. root) + '-'.join(self.exclude) + \67 return hash(self._file2uri(self.path) + '-'.join(self.exclude) + \ 68 68 '+'.join(self.include)) 69 69 … … 78 78 def _uri2file(self, uri): 79 79 scheme, location, path, query, fragment = urlsplit(uri, 'file') 80 if scheme not in'file':80 if scheme != 'file': 81 81 raise InvalidURI("URI scheme in '%s' not supported by FileSource" 82 82 % scheme) 83 83 path = os.path.normpath(path) 84 if not path.startswith(self. root):84 if not path.startswith(self.path): 85 85 raise InvalidURI("Requested URI '%s' is not from this FileSource" 86 86 % uri) 87 87 return path.decode(self.encoding) 88 89 90 source_factory = ComponentFactory(FileSource, 91 include=ComponentFactory.List(str), 92 exclude=ComponentFactory.List(str)) pyndexter/branches/refactoring/pyndexter/sources/metasource.py
r361 r364 7 7 # 8 8 9 import pickle 10 from StringIO import StringIO 9 11 from pyndexter import * 10 12 from urlparse import urlsplit 11 import pickle12 13 13 14 class MetaSource(Source): … … 56 57 file.write(pickle.dumps(state, 2)) 57 58 58 def difference(self, state):59 def difference(self, file): 59 60 try: 60 state = pickle.loads( state)61 state = pickle.loads(file.read()) 61 62 except Exception, e: 62 63 raise InvalidState('Invalid state provided to MetaSource. ' … … 67 68 yield (ADDED, uri) 68 69 else: 69 for change in source.difference(state[hash(source)]): 70 pseudo_file = StringIO(state[hash(source)]) 71 for change in source.difference(pseudo_file): 70 72 yield change pyndexter/branches/refactoring/pyndexter/util.py
r361 r364 13 13 except: 14 14 from sets import Set as set 15 15 16 16 17 class CacheDict(DictMixin): … … 60 61 `cgi.parse_qs()`. 61 62 62 scheme://username:password@netloc/path;parameters?query#fragment 63 scheme://username:password@netloc/path?query#fragment 64 65 TODO: Support "parameters???" Never seen this: 66 scheme://username:password@netloc/path;parameters?query#fragment 63 67 64 68 PS. `urlparse` is not useful. """ … … 75 79 groups = match.groups() 76 80 return groups[0:5] + (parse_qs(groups[5] or ''),) + groups[6:] 77 78 79 def cast_args(args, types):80 """ Cast a set of arguments to the types represented in types. types is a81 dictionary of argument names and their associated type. """82 cast = {}83 nop = lambda o: o84 for k, v in args.iteritems():85 if k in strip:86 continue87 if v is not None:88 v = types.get(k, nop)(v)89 cast[k] = v90 return castpyndexter/branches/refactoring/.todo
r357 r364 4 4 </title> 5 5 <note priority="medium" time="1145722536"> 6 Callbacks for index() and discard(), perhaps something similar for Source objects 6 Callbacks for index() and discard(), perhaps something similar for Source objects? 7 7 </note> 8 8 <note priority="medium" time="1145802778"> … … 30 30 Refactor Indexer into two classes: the Indexer itself, and a class that glues Source and the Indexer together. This would remove the duplication I'm getting in all the stock methods (update, index, fetch, etc.) 31 31 </note> 32 <note priority="medium" time="1168868728"> 33 Add slicing on Search objects. This will allow fast pagination in result displays. 34 </note> 35 <note priority="low" time="1168875038"> 36 Add some "stock" query translators (eg. a AND b OR c style, a b or c, +a +b c, etc.) 37 </note> 32 38 </todo>
