Changeset 377

Show
Ignore:
Timestamp:
02/06/07 19:01:35 (2 years ago)
Author:
athomas
Message:

pyndexter:

  • Added preliminary stemming interface, with a Porter stemmer built in, plus
    an adapter for the Snowball stemmer.
  • Factored state_store() implementations into Indexer base class.
  • Xapian adapter modified to use stemming interface.
  • Implemented a basic built-in inverted-index indexer. It has no scoring and no substring matching, but it is quite fast.
  • Added a Pyndex adapter.
  • The PluginFactory object now takes care of URI parsing instead of Factory.
  • Deprecated *_args being passed to Factory plugin loading methods. Instances of each plugin can simply be passed instead.
Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • pyndexter/trunk/pyndexter/indexers/hype.py

    r374 r377  
    5353 
    5454    def search(self, query): 
    55         query = query.to_boolean(not_='ANDNOT ').decode('utf-8') 
     55        query = query.as_string(not_='ANDNOT ').decode('utf-8') 
    5656        print query 
    5757        search = self.db.search(query) 
     
    6868    def close(self): 
    6969        self.db = None 
    70  
    71     def state_store(self): 
    72         return StateStore(self.state_path) 
    7370 
    7471 
  • pyndexter/trunk/pyndexter/indexers/hyperestraier.py

    r376 r377  
    5858        self.db.out_doc(id, HyperEstraier.Database.ODCLEAN) 
    5959 
    60     def state_store(self): 
    61         return StateStore(self.state_path) 
    62  
    6360    def search(self, query): 
    64         phrase = query.to_boolean(not_='ANDNOT ') 
     61        phrase = query.as_string(not_='ANDNOT ') 
    6562        return self.hype_search(phrase, simple=False) 
    6663 
  • pyndexter/trunk/pyndexter/indexers/lucene.py

    r376 r377  
    1616        self.path = path 
    1717        self.db_path = os.path.join(path, 'lucene.db') 
    18         self.store_path = os.path.join(path, 'store.db') 
     18        self.state_path = os.path.join(path, 'store.db') 
    1919 
    2020        create = not os.path.exists(self.db_path) and framework.mode == READWRITE 
     
    4343 
    4444    def search(self, query): 
    45         query = query.to_boolean() 
     45        query = query.as_string() 
    4646        searcher = PyLucene.IndexSearcher(self.lucene_store) 
    4747        query = PyLucene.QueryParser('content', self.analyzer).parse(query) 
     
    6969            self.writer.close() 
    7070 
    71     def state_store(self): 
    72         return StateStore(self.store_path) 
    73  
    7471 
    7572indexer_factory = PluginFactory(LuceneIndexer) 
  • pyndexter/trunk/pyndexter/indexers/lupy.py

    r374 r377  
    5555        self.db.close() 
    5656 
    57     def state_store(self): 
    58         return StateStore(self.state_path) 
    59  
    6057    # Internal methods 
    6158    def _compile_query(self, node, op, query): 
  • pyndexter/trunk/pyndexter/indexers/xapian.py

    r374 r377  
    2121 
    2222class XapianIndexer(Indexer): 
    23     def __init__(self, framework, path, stemmer='english', words=r'\w+', 
    24                  max_term_length=240): 
     23    def __init__(self, framework, path, words=r'\w+', max_word_length=240): 
    2524        Indexer.__init__(self, framework) 
    26         self.stemmer = xapian.Stem(stemmer) 
    2725        self.words = re.compile(words) 
    28         self.max_term_length = max_term_length 
     26        self.max_word_length = max_word_length 
    2927 
    3028        path = path.encode('utf-8') 
     
    5250        doc.add_term('Q' + uri) 
    5351 
     52        words = [self.framework.stemmer(w.lower()) 
     53                 for w in set(self.words.findall(content))] 
    5454        for word in self.words.finditer(content): 
    55             term = self.stemmer.stem_word(word.group().lower()) 
    56             if len(term) > self.max_term_length: 
     55            term = self.framework.stemmer(word.group().lower()) 
     56            if len(term) > self.max_word_length: 
    5757               continue 
    5858            doc.add_posting(term, word.start()) 
     
    7575 
    7676    def search(self, query): 
     77 
     78        # Fake stemmer to use the frameworks 
     79        framework = self.framework 
     80        class StemmerWrapper(xapian.Stem): 
     81            def stem_word(self, word): 
     82                return framework.stemmer(word) 
     83 
    7784        query_parser = xapian.QueryParser() 
    78         query_parser.set_stemmer(self.stemmer) 
    79         query = query_parser.parse_query(query.to_boolean().encode('utf-8')) 
     85        query_parser.set_stemmer(StemmerWrapper('english')) 
     86        query = query_parser.parse_query(query.as_string().encode('utf-8').lower()) 
    8087        enquire = xapian.Enquire(self.db) 
    8188        enquire.set_query(query) 
    8289        return XapianResult(self, enquire) 
    8390 
    84     def state_store(self): 
    85         return StateStore(self.state_path) 
    8691 
    87  
    88 indexer_factory = PluginFactory(XapianIndexer) 
     92indexer_factory = PluginFactory(XapianIndexer, max_word_length=int) 
    8993 
    9094 
  • pyndexter/trunk/pyndexter/__init__.py

    r376 r377  
    4141from StringIO import StringIO 
    4242from urlparse import urlsplit, urlunsplit 
    43 try: 
    44     set = set 
    45 except NameError: 
    46     from sets import Set as set 
     43from pyndexter.util import set 
    4744 
    4845 
     
    417414        raise InvalidQuery('Expected terminal, got "%s"' % tokens[0][1]) 
    418415 
    419     def to_boolean(self, and_=' AND ', or_=' OR ', not_='NOT '): 
     416    def as_string(self, and_=' AND ', or_=' OR ', not_='NOT '): 
    420417        """Convert Query to a boolean expression. Useful for indexers with 
    421418        "typical" boolean query syntaxes. 
     
    425422        The expanded operators can be customised for syntactical variations. 
    426423 
    427         >>> Query('foo bar').to_boolean() 
     424        >>> Query('foo bar').as_string() 
    428425        'foo AND bar' 
    429         >>> Query('foo bar or baz').to_boolean() 
     426        >>> Query('foo bar or baz').as_string() 
    430427        'foo AND bar OR baz' 
    431         >>> Query('foo -bar or baz').to_boolean() 
     428        >>> Query('foo -bar or baz').as_string() 
    432429        'foo AND NOT bar OR baz' 
    433430        """ 
     
    538535    def state_store(self): 
    539536        """If this Indexer is capable of storing framework state, return a 
    540         `StateStore` object.""" 
     537        `StateStore` object. By default, if the indexer has a `state_path` 
     538        attribute, a new `StateStore` object will be returned on that path.""" 
     539        if hasattr(self, 'state_path'): 
     540            return StateStore(self.state_path) 
    541541        return None 
    542  
    543542 
    544543class PluginFactory(object): 
     
    549548    ...   def __init__(self, one, two, three=3): 
    550549    ...     print one, two, three 
    551     >>> f = PluginFactory(C, three=int) 
     550    >>> f = PluginFactory(C, three=int, four="three") 
    552551    >>> c = f(one=1, two=2, three=3) 
    553552    1 2 3 
    554     >>> c = f(one=1, two=2, three="three") 
     553    >>> c = f(uri='scheme://?one=1&two=2&three=three') 
    555554    Traceback (most recent call last): 
    556555    ... 
    557     ValueError: invalid literal for int(): three 
     556    ValueError: could not coerce argument "three" with value "three" to type "<type 'int'>": invalid literal for int(): three 
     557    >>> c = f(uri='scheme://?one=1&two=2&four=3') 
     558    1 2 3 
    558559    """ 
    559560 
     
    577578        """Create a new factory. 
    578579 
    579         arg_types is a dictionary of <arg>:<type> mappings.""" 
     580        arg_types is a dictionary of <arg>:<type> mappings. If <type> is a 
     581        string, <arg> will be renamed to this before calling the plugin 
     582        constructor.""" 
    580583 
    581584        self.plugin = plugin 
     585        self.remapped = dict([(k, v) for k, v in arg_types.iteritems() 
     586                              if isinstance(v, basestring)]) 
    582587        self.arg_types = arg_types 
    583588        args, varargs, self.varkw, defaults = \ 
     
    588593        self.args = defaults and args[:-len(defaults)] or args 
    589594 
    590     def __call__(self, **kwargs): 
     595    def __call__(self, uri=None, **kwargs): 
    591596        args = dict(self.defaults.items()) 
     597 
     598        if uri is not None: 
     599            # Merge URI arguments 
     600            if isinstance(uri, basestring): 
     601                from pyndexter.util import URI 
     602                uri = URI(uri) 
     603 
     604            uri.username = uri.username or None 
     605            uri.password = uri.password or None 
     606            uri_components = {'username': uri.username, 'password': uri.password, 
     607                              'host': uri.host, 'path': uri.path, 
     608                              'fragment': uri.fragment} 
     609            # Discard them if they're empty 
     610            uri_components = dict([(k, v) for k, v in uri_components.iteritems() if v]) 
     611            args.update(uri.query) 
     612            args.update(uri_components) 
     613 
     614        # Add keyword arguments 
    592615        args.update(kwargs) 
     616 
     617        # Remap (rename) arguments 
     618        for k, v in self.remapped.iteritems(): 
     619            if k in args: 
     620                args[v] = args[k] 
     621                del args[k] 
    593622 
    594623        # Translate all remaining arguments 
    595624        for k, v in args.items(): 
    596             if v is not None and k in self.arg_types: 
    597                 type = self.arg_types[k] 
     625            if v is not None: 
     626                type = self.arg_types.get(k, lambda v: v) 
    598627                # If it's a list, and not marked as such, convert it to a scalar 
    599628                if isinstance(v, (tuple, list)) and not isinstance(type, self.List): 
     
    601630                        raise ValueError('argument "%s" should be a scalar' % k) 
    602631                    v = v[0] 
    603                 args[k] = type(v) 
     632                try: 
     633                    args[k] = type(v) 
     634                except ValueError, e: 
     635                    raise ValueError('could not coerce argument "%s" with ' 
     636                                     'value "%s" to type "%s": %s' 
     637                                     % (k, v, type, e)) 
    604638 
    605639        return self.plugin(**args) 
     
    611645 
    612646    If the `Indexer` is not capable of storing state and automatic updates are 
    613     desired, a `StateStore` object should be passed to the `Framework`.""" 
    614  
    615     def __init__(self, indexer, sources=[], mode=READWRITE, 
    616                  indexer_args={}, state_store=None): 
     647    desired, a `StateStore` object should be passed to the `Framework`. 
     648 
     649    `indexer` is a URI used to construct an indexer, or an `Indexer` object. 
     650 
     651    `stemmer` is a callable that stems individual words. Indexers can 
     652    optionally use this, though some may have their own stemming mechanisms, 
     653    typically passed as a URI parameter.""" 
     654 
     655    def __init__(self, indexer, sources=[], mode=READWRITE, state_store=None, 
     656                 stemmer=None): 
    617657        self.mode = mode 
    618658 
    619         self.indexer = self._load_plugin('indexer', indexer, indexer_args) 
     659        if isinstance(indexer, basestring): 
     660            self.indexer = self._load_plugin('indexer', indexer) 
     661            self.indexer = self.indexer(framework=self, uri=indexer) 
     662        else: 
     663            self.indexer = indexer 
     664 
     665        if stemmer is None: 
     666            self.stemmer = lambda word: word 
     667        elif isinstance(stemmer, basestring): 
     668            self.stemmer = self._load_plugin('stemmer', stemmer) 
     669            self.stemmer = self.stemmer(uri=stemmer) 
     670        else: 
     671            self.stemmer = stemmer 
    620672 
    621673        if state_store is None: 
     
    624676            self.state_store = state_store 
    625677 
    626         sources = [self._load_plugin('source', source) for source in sources] 
     678        sources = [self._load_plugin('source', source)(framework=self, uri=source) 
     679                   for source in sources] 
    627680 
    628681        from pyndexter.sources.metasource import MetaSource 
     
    631684            self.add_source(source) 
    632685 
    633     def add_source(self, source, source_args={}): 
     686    def add_source(self, source): 
    634687        """ Add a source to be indexed to the framework. Can either be a 
    635688        `Source` instance or a URI.""" 
    636689        if isinstance(source, basestring): 
    637             source = self._load_plugin('source', source, source_args) 
     690            Source = self._load_plugin('source', source) 
     691            source = Source(framework=self, uri=source) 
    638692        self.source.add_source(source) 
    639693 
     
    668722                if transition == REMOVED: 
    669723                    self.discard(uri) 
     724                elif transition == MODIFIED: 
     725                    self.update(uri) 
    670726                else: 
    671727                    self.index(uri) 
     
    725781 
    726782    # Helper methods 
    727     def _load_plugin(self, type, uri, args={}): 
     783    def _load_plugin(self, type, uri): 
    728784        from pyndexter.util import URI 
    729         # Extract URI components 
    730785        uri = URI(uri) 
    731         uri.username = uri.username or None 
    732         uri.password = uri.password or None 
    733         uri_components = {'username': uri.username, 'password': uri.password, 
    734                           'host': uri.host, 'path': uri.path, 
    735                           'fragment': uri.fragment} 
    736         # Discard them if they're empty 
    737         uri_components = dict([(k, v) for k, v in uri_components.iteritems() if v]) 
    738         uri.query.update(uri_components) 
    739         uri.query.update(args) 
    740786        try: 
    741787            module_name = 'pyndexter.%ss.%s' % (type, uri.scheme) 
     
    745791        indexer_factory = getattr(module, type + '_factory') 
    746792        assert isinstance(indexer_factory, PluginFactory) 
    747         return indexer_factory(framework=self, **uri.query) 
     793        return indexer_factory 
    748794 
    749795    def _assert_rw(self): 
  • pyndexter/trunk/pyndexter/util.py

    r374 r377  
    77# 
    88 
    9 import time 
    109import re 
    11 from UserDict import DictMixin 
    1210try: 
    1311    set = set 
    1412except: 
    1513    from sets import Set as set 
     14    from sets import ImmutableSet as frozenset 
    1615 
    1716 
     
    7069            uri += '#' + self.fragment 
    7170        return uri 
     71 
     72def stem_text(words_re, stemmer, min_word_length=3, max_word_length=64): 
     73    """Stem all words in a document. 
     74 
     75    `words_re` is a compiled re object, `stemmer` is a callable returning a 
     76    stemmed word.""" 
     77    from StringIO import StringIO 
     78    out = StringIO() 
     79    for word in words_re.findall(document.content): 
     80        pass 
  • pyndexter/trunk/.todo

    r376 r377  
    5757        </note> 
    5858    </note> 
    59     <note priority="medium" time="1169048222"> 
     59    <note priority="medium" time="1169048222" done="1170655393"> 
    6060        Add a swish-e adapter. The Python module SwishE only appears to expose searching :( 
     61        <comment> 
     62            Done, but only for searching. 
     63        </comment> 
    6164    </note> 
    6265    <note priority="medium" time="1169086953"> 
     
    7174    <note priority="medium" time="1170604364"> 
    7275        Deprecate Hit and just use Document - they're almost identical in functionality. 
     76        <note priority="medium" time="1170812979"> 
     77            Perhaps Results should use the framework to try and fetch a Document, then "underlay" the hit attributes? 
     78        </note> 
    7379    </note> 
    7480    <note priority="medium" time="1170651530"> 
     
    8187        How do we detect when sources have been removed from the index? If file:///tmp changes to file:///usr, the Framework has no real way of detecting which URI's in the index are no longer valid. 
    8288    </note> 
     89    <note priority="medium" time="1170685227"> 
     90        Default indexer tasks 
     91        <note priority="medium" time="1170685251"> 
     92            Abstract storage mechanism so that sqlite, metakit, anydbm, etc. can be used. This would allow for wide use. 
     93        </note> 
     94        <note priority="medium" time="1170685266"> 
     95            Use bigrams same as the current 'default' search? This is a good solution I think. Allows for sub-word searches, start and end of word searches, etc. 
     96        </note> 
     97        <note priority="medium" time="1170685271"> 
     98            Optionally use snowball stemmer. 
     99        </note> 
     100        <note priority="medium" time="1170685277"> 
     101            Have a built-in stemmer? Porter? 
     102        </note> 
     103        <note priority="medium" time="1170685318"> 
     104            Use "nltk" stemmer? 
     105        </note> 
     106    </note> 
     107    <note priority="medium" time="1170686012"> 
     108        http://www.biais.org/blog/index.php/2007/01/31/25-spelling-correction-using-the-python-natural-language-toolkit-nltk &lt;- interesting 
     109    </note> 
     110    <note priority="medium" time="1170739349"> 
     111        Pyndex adapter. 
     112    </note> 
     113    <note priority="medium" time="1170813131"> 
     114        Add utility function for converting attribute dictionary keys to plain strings (common pattern). 
     115    </note> 
    83116</todo>