Changeset 377
- Timestamp:
- 02/06/07 19:01:35 (2 years ago)
- Files:
-
- pyndexter/trunk/pyndexter/indexers/builtin.py (added)
- pyndexter/trunk/pyndexter/indexers/hype.py (modified) (2 diffs)
- pyndexter/trunk/pyndexter/indexers/hyperestraier.py (modified) (1 diff)
- pyndexter/trunk/pyndexter/indexers/lucene.py (modified) (3 diffs)
- pyndexter/trunk/pyndexter/indexers/lupy.py (modified) (1 diff)
- pyndexter/trunk/pyndexter/indexers/pyndex.py (added)
- pyndexter/trunk/pyndexter/indexers/xapian.py (modified) (3 diffs)
- pyndexter/trunk/pyndexter/__init__.py (modified) (14 diffs)
- pyndexter/trunk/pyndexter/stemmers (added)
- pyndexter/trunk/pyndexter/stemmers/__init__.py (added)
- pyndexter/trunk/pyndexter/stemmers/porter.py (added)
- pyndexter/trunk/pyndexter/stemmers/snowball.py (added)
- pyndexter/trunk/pyndexter/util.py (modified) (2 diffs)
- pyndexter/trunk/.todo (modified) (3 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/trunk/pyndexter/indexers/hype.py
r374 r377 53 53 54 54 def search(self, query): 55 query = query.to_boolean(not_='ANDNOT ').decode('utf-8')55 query = query.as_string(not_='ANDNOT ').decode('utf-8') 56 56 print query 57 57 search = self.db.search(query) … … 68 68 def close(self): 69 69 self.db = None 70 71 def state_store(self):72 return StateStore(self.state_path)73 70 74 71 pyndexter/trunk/pyndexter/indexers/hyperestraier.py
r376 r377 58 58 self.db.out_doc(id, HyperEstraier.Database.ODCLEAN) 59 59 60 def state_store(self):61 return StateStore(self.state_path)62 63 60 def search(self, query): 64 phrase = query.to_boolean(not_='ANDNOT ')61 phrase = query.as_string(not_='ANDNOT ') 65 62 return self.hype_search(phrase, simple=False) 66 63 pyndexter/trunk/pyndexter/indexers/lucene.py
r376 r377 16 16 self.path = path 17 17 self.db_path = os.path.join(path, 'lucene.db') 18 self.store_path = os.path.join(path, 'store.db')18 self.state_path = os.path.join(path, 'store.db') 19 19 20 20 create = not os.path.exists(self.db_path) and framework.mode == READWRITE … … 43 43 44 44 def search(self, query): 45 query = query.to_boolean()45 query = query.as_string() 46 46 searcher = PyLucene.IndexSearcher(self.lucene_store) 47 47 query = PyLucene.QueryParser('content', self.analyzer).parse(query) … … 69 69 self.writer.close() 70 70 71 def state_store(self):72 return StateStore(self.store_path)73 74 71 75 72 indexer_factory = PluginFactory(LuceneIndexer) pyndexter/trunk/pyndexter/indexers/lupy.py
r374 r377 55 55 self.db.close() 56 56 57 def state_store(self):58 return StateStore(self.state_path)59 60 57 # Internal methods 61 58 def _compile_query(self, node, op, query): pyndexter/trunk/pyndexter/indexers/xapian.py
r374 r377 21 21 22 22 class XapianIndexer(Indexer): 23 def __init__(self, framework, path, stemmer='english', words=r'\w+', 24 max_term_length=240): 23 def __init__(self, framework, path, words=r'\w+', max_word_length=240): 25 24 Indexer.__init__(self, framework) 26 self.stemmer = xapian.Stem(stemmer)27 25 self.words = re.compile(words) 28 self.max_ term_length = max_term_length26 self.max_word_length = max_word_length 29 27 30 28 path = path.encode('utf-8') … … 52 50 doc.add_term('Q' + uri) 53 51 52 words = [self.framework.stemmer(w.lower()) 53 for w in set(self.words.findall(content))] 54 54 for word in self.words.finditer(content): 55 term = self. stemmer.stem_word(word.group().lower())56 if len(term) > self.max_ term_length:55 term = self.framework.stemmer(word.group().lower()) 56 if len(term) > self.max_word_length: 57 57 continue 58 58 doc.add_posting(term, word.start()) … … 75 75 76 76 def search(self, query): 77 78 # Fake stemmer to use the frameworks 79 framework = self.framework 80 class StemmerWrapper(xapian.Stem): 81 def stem_word(self, word): 82 return framework.stemmer(word) 83 77 84 query_parser = xapian.QueryParser() 78 query_parser.set_stemmer( self.stemmer)79 query = query_parser.parse_query(query. to_boolean().encode('utf-8'))85 query_parser.set_stemmer(StemmerWrapper('english')) 86 query = query_parser.parse_query(query.as_string().encode('utf-8').lower()) 80 87 enquire = xapian.Enquire(self.db) 81 88 enquire.set_query(query) 82 89 return XapianResult(self, enquire) 83 90 84 def state_store(self):85 return StateStore(self.state_path)86 91 87 88 indexer_factory = PluginFactory(XapianIndexer) 92 indexer_factory = PluginFactory(XapianIndexer, max_word_length=int) 89 93 90 94 pyndexter/trunk/pyndexter/__init__.py
r376 r377 41 41 from StringIO import StringIO 42 42 from urlparse import urlsplit, urlunsplit 43 try: 44 set = set 45 except NameError: 46 from sets import Set as set 43 from pyndexter.util import set 47 44 48 45 … … 417 414 raise InvalidQuery('Expected terminal, got "%s"' % tokens[0][1]) 418 415 419 def to_boolean(self, and_=' AND ', or_=' OR ', not_='NOT '):416 def as_string(self, and_=' AND ', or_=' OR ', not_='NOT '): 420 417 """Convert Query to a boolean expression. Useful for indexers with 421 418 "typical" boolean query syntaxes. … … 425 422 The expanded operators can be customised for syntactical variations. 426 423 427 >>> Query('foo bar'). to_boolean()424 >>> Query('foo bar').as_string() 428 425 'foo AND bar' 429 >>> Query('foo bar or baz'). to_boolean()426 >>> Query('foo bar or baz').as_string() 430 427 'foo AND bar OR baz' 431 >>> Query('foo -bar or baz'). to_boolean()428 >>> Query('foo -bar or baz').as_string() 432 429 'foo AND NOT bar OR baz' 433 430 """ … … 538 535 def state_store(self): 539 536 """If this Indexer is capable of storing framework state, return a 540 `StateStore` object.""" 537 `StateStore` object. By default, if the indexer has a `state_path` 538 attribute, a new `StateStore` object will be returned on that path.""" 539 if hasattr(self, 'state_path'): 540 return StateStore(self.state_path) 541 541 return None 542 543 542 544 543 class PluginFactory(object): … … 549 548 ... def __init__(self, one, two, three=3): 550 549 ... print one, two, three 551 >>> f = PluginFactory(C, three=int )550 >>> f = PluginFactory(C, three=int, four="three") 552 551 >>> c = f(one=1, two=2, three=3) 553 552 1 2 3 554 >>> c = f( one=1, two=2, three="three")553 >>> c = f(uri='scheme://?one=1&two=2&three=three') 555 554 Traceback (most recent call last): 556 555 ... 
557 ValueError: invalid literal for int(): three 556 ValueError: could not coerce argument "three" with value "three" to type "<type 'int'>": invalid literal for int(): three 557 >>> c = f(uri='scheme://?one=1&two=2&four=3') 558 1 2 3 558 559 """ 559 560 … … 577 578 """Create a new factory. 578 579 579 arg_types is a dictionary of <arg>:<type> mappings.""" 580 arg_types is a dictionary of <arg>:<type> mappings. If <type> is a 581 string, <arg> will be renamed to this before calling the plugin 582 constructor.""" 580 583 581 584 self.plugin = plugin 585 self.remapped = dict([(k, v) for k, v in arg_types.iteritems() 586 if isinstance(v, basestring)]) 582 587 self.arg_types = arg_types 583 588 args, varargs, self.varkw, defaults = \ … … 588 593 self.args = defaults and args[:-len(defaults)] or args 589 594 590 def __call__(self, **kwargs):595 def __call__(self, uri=None, **kwargs): 591 596 args = dict(self.defaults.items()) 597 598 if uri is not None: 599 # Merge URI arguments 600 if isinstance(uri, basestring): 601 from pyndexter.util import URI 602 uri = URI(uri) 603 604 uri.username = uri.username or None 605 uri.password = uri.password or None 606 uri_components = {'username': uri.username, 'password': uri.password, 607 'host': uri.host, 'path': uri.path, 608 'fragment': uri.fragment} 609 # Discard them if they're empty 610 uri_components = dict([(k, v) for k, v in uri_components.iteritems() if v]) 611 args.update(uri.query) 612 args.update(uri_components) 613 614 # Add keyword arguments 592 615 args.update(kwargs) 616 617 # Remap (rename) arguments 618 for k, v in self.remapped.iteritems(): 619 if k in args: 620 args[v] = args[k] 621 del args[k] 593 622 594 623 # Translate all remaining arguments 595 624 for k, v in args.items(): 596 if v is not None and k in self.arg_types:597 type = self.arg_types [k]625 if v is not None: 626 type = self.arg_types.get(k, lambda v: v) 598 627 # If it's a list, and not marked as such, convert it to a scalar 599 628 if 
isinstance(v, (tuple, list)) and not isinstance(type, self.List): … … 601 630 raise ValueError('argument "%s" should be a scalar' % k) 602 631 v = v[0] 603 args[k] = type(v) 632 try: 633 args[k] = type(v) 634 except ValueError, e: 635 raise ValueError('could not coerce argument "%s" with ' 636 'value "%s" to type "%s": %s' 637 % (k, v, type, e)) 604 638 605 639 return self.plugin(**args) … … 611 645 612 646 If the `Indexer` is not capable of storing state and automatic updates are 613 desired, a `StateStore` object should be passed to the `Framework`.""" 614 615 def __init__(self, indexer, sources=[], mode=READWRITE, 616 indexer_args={}, state_store=None): 647 desired, a `StateStore` object should be passed to the `Framework`. 648 649 `indexer` is a URI used to construct an indexer, or an `Indexer` object. 650 651 `stemmer` is a callable that stems individual words. Indexers can 652 optionally use this, though some may have their own stemming mechanisms, 653 typically passed as a URI parameter.""" 654 655 def __init__(self, indexer, sources=[], mode=READWRITE, state_store=None, 656 stemmer=None): 617 657 self.mode = mode 618 658 619 self.indexer = self._load_plugin('indexer', indexer, indexer_args) 659 if isinstance(indexer, basestring): 660 self.indexer = self._load_plugin('indexer', indexer) 661 self.indexer = self.indexer(framework=self, uri=indexer) 662 else: 663 self.indexer = indexer 664 665 if stemmer is None: 666 self.stemmer = lambda word: word 667 elif isinstance(stemmer, basestring): 668 self.stemmer = self._load_plugin('stemmer', stemmer) 669 self.stemmer = self.stemmer(uri=stemmer) 670 else: 671 self.stemmer = stemmer 620 672 621 673 if state_store is None: … … 624 676 self.state_store = state_store 625 677 626 sources = [self._load_plugin('source', source) for source in sources] 678 sources = [self._load_plugin('source', source)(framework=self, uri=source) 679 for source in sources] 627 680 628 681 from pyndexter.sources.metasource import MetaSource … 
… 631 684 self.add_source(source) 632 685 633 def add_source(self, source , source_args={}):686 def add_source(self, source): 634 687 """ Add a source to be indexed to the framework. Can either be a 635 688 `Source` instance or a URI.""" 636 689 if isinstance(source, basestring): 637 source = self._load_plugin('source', source, source_args) 690 Source = self._load_plugin('source', source) 691 source = Source(framework=self, uri=source) 638 692 self.source.add_source(source) 639 693 … … 668 722 if transition == REMOVED: 669 723 self.discard(uri) 724 elif transition == MODIFIED: 725 self.update(uri) 670 726 else: 671 727 self.index(uri) … … 725 781 726 782 # Helper methods 727 def _load_plugin(self, type, uri , args={}):783 def _load_plugin(self, type, uri): 728 784 from pyndexter.util import URI 729 # Extract URI components730 785 uri = URI(uri) 731 uri.username = uri.username or None732 uri.password = uri.password or None733 uri_components = {'username': uri.username, 'password': uri.password,734 'host': uri.host, 'path': uri.path,735 'fragment': uri.fragment}736 # Discard them if they're empty737 uri_components = dict([(k, v) for k, v in uri_components.iteritems() if v])738 uri.query.update(uri_components)739 uri.query.update(args)740 786 try: 741 787 module_name = 'pyndexter.%ss.%s' % (type, uri.scheme) … … 745 791 indexer_factory = getattr(module, type + '_factory') 746 792 assert isinstance(indexer_factory, PluginFactory) 747 return indexer_factory (framework=self, **uri.query)793 return indexer_factory 748 794 749 795 def _assert_rw(self): pyndexter/trunk/pyndexter/util.py
r374 r377 7 7 # 8 8 9 import time10 9 import re 11 from UserDict import DictMixin12 10 try: 13 11 set = set 14 12 except: 15 13 from sets import Set as set 14 from sets import ImmutableSet as frozenset 16 15 17 16 … … 70 69 uri += '#' + self.fragment 71 70 return uri 71 72 def stem_text(words_re, stemmer, min_word_length=3, max_word_length=64): 73 """Stem all words in a document. 74 75 `words_re` is a compiled re object, `stemmer` is a callable returning a 76 stemmed word.""" 77 from StringIO import StringIO 78 out = StringIO() 79 for word in words_re.findall(document.content): 80 pass pyndexter/trunk/.todo
r376 r377 57 57 </note> 58 58 </note> 59 <note priority="medium" time="1169048222" >59 <note priority="medium" time="1169048222" done="1170655393"> 60 60 Add a swish-e adapter. The Python module SwishE only appears to expose searching :( 61 <comment> 62 Done, but only for searching. 63 </comment> 61 64 </note> 62 65 <note priority="medium" time="1169086953"> … … 71 74 <note priority="medium" time="1170604364"> 72 75 Deprecate Hit and just use Document - they're almost identical in functionality. 76 <note priority="medium" time="1170812979"> 77 Perhaps Results should use the framework to try and fetch a Document, then "underlay" the hit attributes? 78 </note> 73 79 </note> 74 80 <note priority="medium" time="1170651530"> … … 81 87 How do we detect when sources have been removed from the index? If file:///tmp changes to file:///usr, the Framework has no real way of detecting which URI's in the index are no longer valid. 82 88 </note> 89 <note priority="medium" time="1170685227"> 90 Default indexer tasks 91 <note priority="medium" time="1170685251"> 92 Abstract storage mechanism so that sqlite, metakit, anydbm, etc. can be used. This would allow for wide use. 93 </note> 94 <note priority="medium" time="1170685266"> 95 Use bigrams same as the current 'default' search? This is a good solution I think. Allows for sub-word searches, start and end of word searches, etc. 96 </note> 97 <note priority="medium" time="1170685271"> 98 Optionally use snowball stemmer. 99 </note> 100 <note priority="medium" time="1170685277"> 101 Have a built-in stemmer? Porter? 102 </note> 103 <note priority="medium" time="1170685318"> 104 Use "nltk" stemmer? 105 </note> 106 </note> 107 <note priority="medium" time="1170686012"> 108 http://www.biais.org/blog/index.php/2007/01/31/25-spelling-correction-using-the-python-natural-language-toolkit-nltk <- interesting 109 </note> 110 <note priority="medium" time="1170739349"> 111 Pyndex adapter. 
112 </note> 113 <note priority="medium" time="1170813131"> 114 Add utility function for converting attribute dictionary keys to plain strings (common pattern). 115 </note> 83 116 </todo>
