Changeset 367
- Timestamp: 01/18/07 07:27:52 (2 years ago)
- Files:
- pyndexter/branches/refactoring/pyndexter/indexers/hype.py (modified) (2 diffs)
- pyndexter/branches/refactoring/pyndexter/indexers/hyperestraier.py (modified) (1 diff)
- pyndexter/branches/refactoring/pyndexter/indexers/__init__.py (modified) (1 diff)
- pyndexter/branches/refactoring/pyndexter/indexers/lucene.py (modified) (1 diff)
- pyndexter/branches/refactoring/pyndexter/indexers/lupy.py (modified) (4 diffs)
- pyndexter/branches/refactoring/pyndexter/indexers/swishe.py (added)
- pyndexter/branches/refactoring/pyndexter/indexers/xapian.py (modified) (1 diff)
- pyndexter/branches/refactoring/pyndexter/__init__.py (modified) (20 diffs)
- pyndexter/branches/refactoring/pyndexter/sources/file.py (modified) (2 diffs)
- pyndexter/branches/refactoring/pyndexter/sources/__init__.py (modified) (1 diff)
- pyndexter/branches/refactoring/pyndexter/util.py (modified) (3 diffs)
- pyndexter/branches/refactoring/.todo (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/branches/refactoring/pyndexter/indexers/hype.py
r366 r367 42 42 for k, v in document.attributes.iteritems(): 43 43 if k != 'uri': 44 hdoc['@' + k] = v44 hdoc['@' + k] = unicode(v) 45 45 hdoc.add_text(document.content) 46 46 self.db.put_doc(hdoc) … … 89 89 90 90 91 indexer_factory = ComponentFactory(HypeIndexer, hype_mode=int,91 indexer_factory = PluginFactory(HypeIndexer, hype_mode=int, 92 92 enable_scoring=bool) 93 93 pyndexter/branches/refactoring/pyndexter/indexers/hyperestraier.py
r366 r367 102 102 103 103 104 indexer_factory = ComponentFactory(HyperestraierIndexer, hype_mode=int)104 indexer_factory = PluginFactory(HyperestraierIndexer, hype_mode=int) 105 105 106 106 pyndexter/branches/refactoring/pyndexter/indexers/__init__.py
r364 r367 6 6 # you should have received as part of this distribution. 7 7 # 8 9 __all__ = ['hyperestraier', 'hype', 'xapian', 'lucene', 'lupy', 'default', 'pyrex']pyndexter/branches/refactoring/pyndexter/indexers/lucene.py
r361 r367 7 7 # 8 8 9 import os 9 10 import PyLucene 10 11 from pyndexter import * 11 12 12 13 class LuceneIndexer(Indexer): 14 def __init__(self, framework, path): 15 self.path = path 16 self.db_path = os.path.join(path, 'lucene.db') 17 self.store_path = os.path.join(path, 'store.db') 18 19 self.lucene_store = PyLucene.FSDirectory.getDirectory(db_path, True) 20 21 if framework.mode == READWRITE: 22 analyser = PyLucene.StandardAnalyzer() 23 self.writer = PyLucene.IndexWriter(self.lucene_store, analyser, True) 24 #self.writer.setMaxFieldLength(1048576) # ?? 25 26 self.reader = PyLucene.IndexReader.open(self.path) 27 self.searcher = PyLucene.IndexSearcher(self.lucene_store) 28 29 def index(self, document): 30 doc = PyLucene.Document() 31 for k, v in document.attributes.iteritems(): 32 doc.add(PyLucene.Field(k, v, PyLucene.Field.Store.YES, 33 PyLucene.Field.Index.TOKENIZED)) 34 reader = PyLucene.StringReader(doc.content) 35 doc.add(Field('content', reader)) 36 self.writer.addDocument(doc) 37 38 def search(self, query): 39 query = self._compile_query(query) 40 query = PyLucene.QueryParser.parse(query, 'title', analyzer) 41 raise query 42 43 def discard(self): 44 pass 45 46 def close(self): 47 self.db.close() 48 49 def sync(self): 50 self.db.synchronize() 51 52 def state_store(self): 53 return StateStore(self.store_path) 54 55 # Internal methods 56 def _compile_query(self, node): 57 if not node or node.type == node.NULL: 58 return '' 59 if node.type == node.AND: 60 return '%s AND %s' % (self._compile_query(node.left), 61 self._compile_query(node.right)) 62 elif node.type == node.OR: 63 return '%s OR %s' % (self._compile_query(node.left), 64 self._compile_query(node.right)) 65 elif node.type == node.NOT: 66 return 'NOT %s' % self._compile_query(node.left) 67 elif node.type == node.TERM: 68 return node.value 69 else: 70 raise NotImplementedError 71 72 73 class LuceneResult(Result): 13 74 pass pyndexter/branches/refactoring/pyndexter/indexers/lupy.py
r366 r367 8 8 9 9 """ 10 Adapter for the (deprecated, but still available from11 http://gentoo.prz.rzeszow.pl/distfiles/Lupy-0.2.1.tar.gz )Lupy indexer.10 Adapter for the deprecated, but still available from 11 http://gentoo.prz.rzeszow.pl/distfiles/Lupy-0.2.1.tar.gz, Lupy indexer. 12 12 """ 13 13 … … 17 17 lupy.indexer = __import__('lupy.indexer', {}, {}, ['']) 18 18 lupy.search = __import__('lupy.search', {}, {}, ['']) 19 19 20 20 21 class LupyIndexer(Indexer): … … 31 32 32 33 33 def fetch(self, uri):34 hits = list(self.db.findInField(uri=uri))35 print hits36 37 34 def index(self, document): 38 35 attributes = dict([('_' + k.encode('utf-8'), str(v)) 39 36 for k, v in document.attributes.iteritems() 40 37 if v is not None]) 38 self.discard(uri=document.uri) 41 39 self.db.index(text=document.content, **attributes) 40 41 def discard(self, uri): 42 self.db.delete(uri=uri) 42 43 43 44 def search(self, query): … … 75 76 76 77 77 indexer_factory = ComponentFactory(LupyIndexer)78 indexer_factory = PluginFactory(LupyIndexer) 78 79 79 80 class LupyResult(Result): 80 81 def __iter__(self): 81 for hit in self.context: 82 fields = dict([(str(k), hit.get(k)) for k in hit.fieldNames]) 83 yield Hit(document=self.indexer.framework.fetch, 84 **fields) 82 for index, doc in enumerate(self.context): 83 yield self._translate(index, doc) 85 84 85 def __getitem__(self, index): 86 return self._translate(index, self.context[index]) 86 87 88 # Internal methods 89 def _translate(self, index, doc): 90 fields = dict([(str(k), doc.get(k)) for k in doc.fieldNames]) 91 fields['score'] = self.context.score(index) 92 return Hit(document=self.indexer.framework.fetch, 93 **fields) pyndexter/branches/refactoring/pyndexter/indexers/xapian.py
r366 r367 96 96 97 97 98 indexer_factory = ComponentFactory(XapianIndexer)98 indexer_factory = PluginFactory(XapianIndexer) 99 99 100 100 pyndexter/branches/refactoring/pyndexter/__init__.py
r366 r367 6 6 # you should have received as part of this distribution. 7 7 # 8 9 10 """ 11 Pyndexter provides a uniform API for accessing a variety of full-text 12 indexing engines. It is similar in purpose to the Python DB API. 13 14 The main class users will be dealing with is Framework. This class 15 ties indexers and sources of documents together and provides a mechanism 16 for performing automatic updates. 17 18 An example of indexing all .txt files underneath /usr/share/doc: 19 20 import os 21 from pyndexter import Framework 22 23 framework = Framework('hyperestraier:///tmp/hyperestraier.idx') 24 framework.add_source('file:///usr/share/doc?include=*.txt') 25 26 framework.update() 27 28 # Find all documents with Linus and Torvalds in them 29 for hit in framework.search('Linus Torvalds'): 30 print hit.uri 31 32 framework.close() 33 """ 34 8 35 9 36 import re … … 19 46 from sets import Set as set 20 47 48 21 49 __all__ = """ 22 50 Error … … 33 61 READONLY READWRITE 34 62 35 Query Framework Document Source Indexer Result Hit StateStore ComponentFactory63 Query Framework Document Source Indexer Result Hit StateStore PluginFactory 36 64 """.split() 65 37 66 38 67 # Source state difference constants … … 48 77 class Error(Exception): 49 78 """ Base of all pyndexter exceptions. """ 79 50 80 class DocumentNotFound(Error): 51 81 """ Raised when a document could not be found, usually by the fetch() 52 82 methods. """ 83 53 84 class InvalidURI(Error): 54 85 """ The URI provided was invalid in that context. """ 86 55 87 class SourceError(Error): 56 88 """ Base of all exceptions raised exclusively by Sources. """ 89 57 90 class InvalidState(SourceError): 58 91 """ The state provided to a source was invalid. """ 92 59 93 class IndexerError(Error): 60 94 """ Base of all exceptions raised exclusively by Indexers. """ 95 61 96 class InvalidMode(IndexerError): 62 97 """ The mode (READONLY or READWRITE) of the indexer is an 63 98 invalid state for a particular operation. 
""" 99 64 100 class InvalidQuery(Error): 65 101 """ Invalid query string. """ 102 66 103 class FrameworkError(Error): 67 104 """Base of Framework errors.""" 105 68 106 class InvalidModule(FrameworkError): 69 107 """The module provided was not loadable.""" … … 224 262 225 263 class QueryNode(object): 226 """ A query parse node. """ 264 """A query parse node. 265 266 >>> QueryNode(QueryNode.TERM, 'one') 267 ("one") 268 >>> QueryNode(QueryNode.AND, 269 ... left=QueryNode(QueryNode.TERM, 'one'), 270 ... right=QueryNode(QueryNode.TERM, 'two')) 271 (and 272 ("one") 273 ("two")) 274 >>> QueryNode(QueryNode.NOT, left=QueryNode(QueryNode.TERM, 'one')) 275 (not 276 ("one") 277 nil) 278 """ 279 227 280 228 281 NULL = 0 … … 300 353 """ 301 354 302 _tokenise = re.compile(r"(?P<ex>-)|(?P<or>or)|\"(?P<dq>(?:\\.|[^\"])*)\"|'(?P<sq>(?:\\.|[^'])*)'|(?P<te>(?:\S)+)", re.I)355 _tokenise_re = re.compile(r"(?P<ex>-)|(?P<or>or)|\"(?P<dq>(?:\\.|[^\"])*)\"|'(?P<sq>(?:\\.|[^'])*)'|(?P<te>(?:\S)+)", re.I) 303 356 _group_map = {'dq': QueryNode.TERM, 'sq': QueryNode.TERM, 'te': QueryNode.TERM, 304 357 'ex': QueryNode.NOT, 'or': QueryNode.OR} … … 306 359 def __init__(self, phrase): 307 360 QueryNode.__init__(self, None) 308 tokens = [(self._group_map[token.lastgroup], token.group(token.lastindex)) 309 for token in self._tokenise.finditer(phrase)] 361 tokens = self._tokenise(phrase) 310 362 root = self.parse(tokens) 311 363 self.phrase = phrase 312 364 if root: 365 # Make ourselves into the root node 313 366 for k in self.__slots__: 314 367 setattr(self, k, getattr(root, k)) … … 326 379 327 380 def parse_unary(self, tokens): 381 """Parse a unary operator. Currently only NOT. 382 383 >>> q = Query('') 384 >>> q.parse_unary(q._tokenise('-foo')) 385 (not 386 ("foo") 387 nil) 388 """ 328 389 if not tokens: 329 390 return None … … 334 395 335 396 def parse_terminal(self, tokens): 397 """Parse a terminal token. 
398 399 >>> q = Query('') 400 >>> q.parse_terminal(q._tokenise('foo')) 401 ("foo") 402 """ 403 336 404 if not tokens: 337 405 raise InvalidQuery('Unexpected end of string') … … 341 409 raise InvalidQuery('Expected terminal, got "%s"' % tokens[0][1]) 342 410 411 # Internal methods 412 def _tokenise(self, phrase): 413 """Tokenise a phrase string. 414 415 >>> q = Query('') 416 >>> q._tokenise('one') 417 [(1, 'one')] 418 >>> q._tokenise('one two') 419 [(1, 'one'), (1, 'two')] 420 >>> q._tokenise('one or two') 421 [(1, 'one'), (4, 'or'), (1, 'two')] 422 >>> q._tokenise('"one two"') 423 [(1, 'one two')] 424 >>> q._tokenise("'one two'") 425 [(1, 'one two')] 426 >>> q._tokenise('-one') 427 [(2, '-'), (1, 'one')] 428 """ 429 tokens = [(self._group_map[token.lastgroup], token.group(token.lastindex)) 430 for token in self._tokenise_re.finditer(phrase)] 431 return tokens 343 432 344 433 class StateStore(object): … … 412 501 413 502 414 class ComponentFactory(object):503 class PluginFactory(object): 415 504 """Factory for translating URL-style query parameters into a standard 416 constructor call.""" 505 module constructor call. pyndexter modules always 506 507 >>> class C: 508 ... def __init__(self, one, two, three=3): 509 ... print one, two, three 510 >>> f = PluginFactory(C, three=int) 511 >>> c = f(one=1, two=2, three=3) 512 1 2 3 513 >>> c = f(one=1, two=2, three="three") 514 Traceback (most recent call last): 515 ... 
516 ValueError: invalid literal for int(): three 517 """ 417 518 418 519 class List(object): 419 """Translate a parameter that is a list of elements of `type`.""" 420 def __init__(self, type): 520 """Translate a parameter that is a list of elements of `type`, 521 optionally splitting on commas.""" 522 def __init__(self, type, split=None): 421 523 self.type = type 524 self.split = split 422 525 423 526 def __call__(self, value): 424 return [self.type(v) for v in value] 425 426 def __init__(self, indexer, **arg_types): 527 if self.split: 528 out = [] 529 for v in value: 530 split_out += i.split(',') 531 return split_out 532 else: 533 return [self.type(v) for v in value] 534 535 def __init__(self, plugin, **arg_types): 427 536 """Create a new factory. 428 537 429 538 arg_types is a dictionary of <arg>:<type> mappings.""" 430 539 431 self. indexer = indexer540 self.plugin = plugin 432 541 self.arg_types = arg_types 433 542 args, varargs, self.varkw, defaults = \ 434 inspect.getargspec(self. indexer.__init__)543 inspect.getargspec(self.plugin.__init__) 435 544 defaults = defaults or [] 436 545 self.defaults = dict(zip(list(args[-len(defaults):]), defaults)) … … 438 547 self.args = defaults and args[:-len(defaults)] or args 439 548 440 def __call__(self, framework,**kwargs):549 def __call__(self, **kwargs): 441 550 args = dict(self.defaults.items()) 442 args['framework'] = framework443 551 args.update(kwargs) 444 552 … … 454 562 args[k] = type(v) 455 563 456 return self. 
indexer(**args)564 return self.plugin(**args) 457 565 458 566 … … 505 613 self._assert_rw() 506 614 if not self.state_store: 507 raise IndexerError("Source state storage path not defined, " 508 "Framework is not capable of automatic " 509 "updates.") 615 raise IndexerError("Indexer not capable of storing source state, " 616 "and store not provided to Framework - not " 617 "capable of automatic updates.") 618 if not filter: 619 def filter(context, stream): 620 for transition, uri in stream: 621 yield transition, uri 622 510 623 if self.state_store.exists(): 511 624 store = self.state_store.retrieve() 512 if not filter:513 def filter(context, stream):514 for transition, uri in stream:515 yield transition, uri516 625 for transition, uri in filter(context, 517 626 self.source.difference(store)): … … 521 630 self.index(uri) 522 631 else: 523 for uri in self.source: 632 def fake_difference(): 633 for uri in self.source: 634 yield ADDED, uri 635 636 for transition, uri in filter(context, fake_difference()): 524 637 self.index(uri) 525 638 … … 555 668 """ Sync and close the indexer. The object is subsequently not 556 669 usable. """ 557 self. 
indexer.sync()670 self.sync() 558 671 self.indexer.close() 559 672 … … 590 703 raise InvalidModule(module_name, e) 591 704 indexer_factory = getattr(module, type + '_factory') 592 assert isinstance(indexer_factory, ComponentFactory)593 return indexer_factory( self, **uri.query)705 assert isinstance(indexer_factory, PluginFactory) 706 return indexer_factory(framework=self, **uri.query) 594 707 595 708 def _assert_rw(self): … … 639 752 self.attributes['uri'] = uri 640 753 754 def get(self, key, default=None): 755 return self.attributes.get(key, default) 756 641 757 def __getattr__(self, key): 758 """Access hit attributes.""" 642 759 try: 643 760 return self.attributes[key] … … 646 763 647 764 def __contains__(self, key): 765 """Determine whether a Hit contains an attribute.""" 648 766 return key in self.attributes 649 767 … … 653 771 654 772 def _get_document(self): 773 """Fetch Document object using callback.""" 655 774 if callable(self._document): 656 775 self._document = self._document(self.uri) pyndexter/branches/refactoring/pyndexter/sources/file.py
r364 r367 6 6 # you should have received as part of this distribution. 7 7 # 8 9 """ 10 A document source for local filesystem. Accepts three 11 """ 8 12 9 13 import sys … … 88 92 89 93 90 source_factory = ComponentFactory(FileSource,91 include= ComponentFactory.List(str),92 exclude= ComponentFactory.List(str))94 source_factory = PluginFactory(FileSource, 95 include=PluginFactory.List(str), 96 exclude=PluginFactory.List(str)) pyndexter/branches/refactoring/pyndexter/sources/__init__.py
r361 r367 6 6 # you should have received as part of this distribution. 7 7 # 8 9 __all__ = ['file', 'metasource']pyndexter/branches/refactoring/pyndexter/util.py
r365 r367 71 71 PS. `urlparse` is not useful. """ 72 72 73 _pattern = re.compile(r'(?P<scheme>[^:]+)://(?:(?P<username>[^:@]*)(?::(?P<password>[^@]*))?@)?(?P<host>[^ /]*)(?P<path>/[^#?]*)?(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?')73 _pattern = re.compile(r'(?P<scheme>[^:]+)://(?:(?P<username>[^:@]*)(?::(?P<password>[^@]*))?@)?(?P<host>[^?/#]*)(?P<path>/[^#?]*)?(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?') 74 74 75 75 __slots__ = ('scheme', 'username', 'password', 'host', 'path', 'query', … … 86 86 groups = groups[0:5] + (parse_qs(groups[5] or ''),) + groups[6:] 87 87 groups = [group or '' for group in groups] 88 if not groups[5]:89 groups[5] = {}90 88 else: 91 89 groups = [''] * 7 90 91 if not groups[5]: 92 groups[5] = {} 92 93 self.scheme, self.username, self.password, self.host, self.path, \ 93 94 self.query, self.fragment = groups 95 96 def __ne__(self, other): 97 return cmp(repr(self), repr(other)) 94 98 95 99 def __repr__(self): … … 104 108 if self.query: 105 109 uri += '?' + '&'.join(['&'.join(['%s=%s' % (k, v) for v in l]) 106 for k, l in s elf.query.items()])110 for k, l in sorted(self.query.items())]) 107 111 if self.fragment: 108 112 uri += '#' + self.fragment pyndexter/branches/refactoring/.todo
r365 r367 27 27 For storing state, perhaps there should be default store_state(store)/restore_state(store) methods. Also need a Store class, or just use a file object... 28 28 </note> 29 <note priority="high" time="1159197046" >29 <note priority="high" time="1159197046" done="1169000053"> 30 30 Refactor Indexer into two classes: the Indexer itself, and a class that glues Source and the Indexer together. This would remove the duplication I'm getting in all the stock methods (update, index, fetch, etc.) 31 <comment> 32 Done as the Framework class. 33 </comment> 31 34 </note> 32 <note priority="medium" time="1168868728" >35 <note priority="medium" time="1168868728" done="1169000047"> 33 36 Add slicing to Result objects. This will allow fast pagination in result displays. 34 37 </note> … … 36 39 Add some "stock" query translators (eg. a AND b OR c style, a b or c, +a +b c, etc.) 37 40 </note> 41 <note priority="medium" time="1169007320"> 42 Incremental updates for the indexer state. Waiting until the end of the index, then writing the state, is bad. A single document error can render the entire index useless. 43 <note priority="medium" time="1169007391"> 44 "Transactions" for state updates? 45 </note> 46 <note priority="medium" time="1169090428"> 47 I think an anydbm style interface for storing state could be useful. 48 </note> 49 </note> 50 <note priority="medium" time="1169048222"> 51 Add a swish-e adapter. The Python module SwishE only appears to expose searching :( 52 </note> 53 <note priority="medium" time="1169086953"> 54 Why is Xapian not returning all the hits? 55 </note> 56 <note priority="medium" time="1169116208"> 57 I'd like to add database Sources, but I can't see a way to handle updated rows without doing a full table scan. 58 </note> 38 59 </todo>
