Changeset 354

Show
Ignore:
Timestamp:
09/25/06 07:49:51 (2 years ago)
Author:
athomas
Message:

pyndexter:

  • Added copyright notices and encoding declarations to the source files
  • All known indexers and sources can now be imported from pyndexter.indexers and pyndexter.sources, respectively.
  • Began fixing the Xapian indexer for #26. It is not yet returning all document hits, which is odd.
  • Updated to the latest Hype API.
Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • pyndexter/trunk/pyndexter/default.py

    r345 r354  
     1# -*- coding: utf-8 -*- 
     2# 
     3# Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 
     4# 
     5# This software is licensed as described in the file COPYING, which 
     6# you should have received as part of this distribution. 
     7# 
     8 
    19import re 
    210import os 
  • pyndexter/trunk/pyndexter/file.py

    r353 r354  
     1# -*- coding: utf-8 -*- 
     2# 
     3# Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 
     4# 
     5# This software is licensed as described in the file COPYING, which 
     6# you should have received as part of this distribution. 
     7# 
     8 
    19import sys 
    210import codecs 
  • pyndexter/trunk/pyndexter/hyperestraier.py

    r345 r354  
     1# -*- coding: utf-8 -*- 
     2# 
     3# Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 
     4# 
     5# This software is licensed as described in the file COPYING, which 
     6# you should have received as part of this distribution. 
     7# 
     8 
    19import os 
    210import hype 
     
    3442        if isinstance(document, basestring): 
    3543            document = self.fetch(document) 
    36         hdoc = hype.Document(
     44        hdoc = hype.Document(document.uri
    3745        for k, v in document.attributes.iteritems(): 
    38             hdoc['@' + k] = v 
     46            if k != 'uri': 
     47                hdoc['@' + k] = v 
    3948        hdoc.add_text(document.content) 
    4049        self.db.put_doc(hdoc) 
  • pyndexter/trunk/pyndexter/__init__.py

    r345 r354  
     1# -*- coding: utf-8 -*- 
     2# 
     3# Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 
     4# 
     5# This software is licensed as described in the file COPYING, which 
     6# you should have received as part of this distribution. 
     7# 
     8 
    19import os 
    210import pickle 
  • pyndexter/trunk/pyndexter/lucene.py

    r322 r354  
     1# -*- coding: utf-8 -*- 
     2# 
     3# Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 
     4# 
     5# This software is licensed as described in the file COPYING, which 
     6# you should have received as part of this distribution. 
     7# 
     8 
    19import PyLucene 
    210from pyndexter import * 
  • pyndexter/trunk/pyndexter/metasource.py

    r345 r354  
     1# -*- coding: utf-8 -*- 
     2# 
     3# Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 
     4# 
     5# This software is licensed as described in the file COPYING, which 
     6# you should have received as part of this distribution. 
     7# 
     8 
    19from pyndexter import * 
    210from urlparse import urlsplit 
  • pyndexter/trunk/pyndexter/util.py

    r345 r354  
     1# -*- coding: utf-8 -*- 
     2# 
     3# Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 
     4# 
     5# This software is licensed as described in the file COPYING, which 
     6# you should have received as part of this distribution. 
     7# 
     8 
    19import time 
    210from UserDict import DictMixin 
  • pyndexter/trunk/pyndexter/xapian.py

    r345 r354  
     1# -*- coding: utf-8 -*- 
     2# 
     3# Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 
     4# 
     5# This software is licensed as described in the file COPYING, which 
     6# you should have received as part of this distribution. 
     7# 
     8 
    19import os 
     10import re 
    211from pyndexter import * 
    3 import xapwrap.index as xi 
    4 import xapwrap.document as xd 
    5  
    6  
    7 # XXX Is a numeric ID the only way to uniquely identify documents in Xapian? 
    8 # XXX This seems crazy, and prone to error. 
    9 def uri2id(uri): 
    10     return abs(hash(uri)) 
    11  
    12  
    13 xd.Document.registerFlattener(long, xd.flattenNumeric) 
    14 xd.Document.registerFlattener(float, xd.flattenNumeric) 
     12xapian = __import__('xapian') 
    1513 
    1614 
     
    2018                   CAP_INTERSECTION 
    2119 
    22     def __init__(self, path, source=None, mode=READWRITE): 
     20    def __init__(self, path, source=None, mode=READWRITE, stemmer='english', 
     21                 words=r'\w+'): 
    2322        Indexer.__init__(self, source, mode, os.path.join(path, 'state.db')) 
    2423        self.path = path 
     
    2625        self.idx_path = os.path.join(path, 'xapian.db') 
    2726        if mode == READWRITE: 
    28             self.idx = xi.SmartIndex(self.idx_path, True) 
     27            self.db = xapian.WritableDatabase(self.idx_path, 
     28                                              xapian.DB_CREATE_OR_OPEN) 
    2929        else: 
    30             self.idx = xi.SmartReadOnlyIndex(self.idx_path) 
     30            self.db = xapian.Database(self.idx_path) 
     31        self.stemmer = xapian.Stem('english') 
     32        self.words = re.compile(words) 
     33 
    3134 
    3235    def index(self, document): 
     
    3538            document = self.fetch(document) 
    3639 
    37         sort_fields = [xd.SortKey(u'uri', document.uri)] 
    38         sort_fields += [xd.SortKey(k, v) for k, v in  
    39                         document.attributes.iteritems() 
    40                         if v is not None and k != 'uri'] 
    41         doc = xd.Document( 
    42                 textFields=xd.TextField(document.content), 
    43                 sortFields=sort_fields, 
    44                 uid=uri2id(document.uri), 
    45                 source=document.uri) 
     40        doc = xapian.Document() 
    4641 
    47         self.idx.index(doc) 
     42        # Xapian doesn't support UTF-8 yet. Coming soon. 
     43        content = document.content.encode('utf-8') 
     44        uri = document.uri.encode('utf-8') 
     45 
     46        doc.set_data(content) 
     47 
     48        doc.add_term('Q' + uri) 
     49 
     50        for word in self.words.finditer(content): 
     51            term = self.stemmer.stem_word(word.group().lower()) 
     52            doc.add_posting(term, word.start()) 
     53 
     54        self.db.replace_document('Q' + uri, doc) 
    4855 
    4956    def discard(self, document): 
     
    5158        if isinstance(document, Document): 
    5259            document = document.uri 
    53         self.idx.delete_document(uri2id(document)) 
     60        self.db.delete_document('Q' + document.encode('utf-8')) 
    5461 
    5562    def sync(self): 
    5663        if self.mode == READWRITE: 
    5764            self._assert_rw() 
    58             self.idx.flush() 
     65            self.db.flush() 
    5966            self._sync_source_state() 
    6067 
    6168    def close(self): 
    6269        self.sync() 
    63         self.idx.close() 
    64         self.idx = None 
     70        self.db = None 
    6571 
    6672    def search(self, phrase, flags=0, order_by=None, order_ascending=True, 
    6773               order_type=str): 
    68         phrase = phrase.encode('utf-8') 
    69         if order_by == 'relevance': 
    70             order_args = {'sortByRelevence': True} 
    71         else: 
    72             order_args = {'sortKey': order_by} 
    73         search = self.idx.search(phrase, sortAscending=order_ascending, 
    74                                  **order_args) 
    75         return XapianSearch(self, phrase, search) 
     74        terms = [self.stemmer.stem_word(term.lower()) 
     75                 for term in self.words.findall(phrase.encode('utf-8'))] 
     76        enquire = xapian.Enquire(self.db) 
     77        query = xapian.Query(xapian.Query.OP_AND, terms) 
     78        enquire.set_query(query) 
     79        return XapianSearch(self, phrase, enquire) 
    7680 
    7781 
    7882class XapianSearch(Search): 
    7983    def __iter__(self): 
    80         for hit in self.context: 
    81             doc = self.indexer.idx.get_document(hit['uid']) 
    82             # XXX Is this the actual way to get values out?!?!? 
    83             yield Hit(doc.get_value(self.indexer.idx.indexValueMap['uri']), 
    84                       document=self.indexer.fetch) 
     84        matches = self.context.get_mset(0, 10) 
     85        print matches.get_matches_estimated() 
     86        for hit in matches: 
     87            doc = hit[xapian.MSET_DOCUMENT] 
     88            uri = None 
     89            # TODO Use skip_to('Q') when implemented (see #26 for more info) 
     90            for term in doc.termlist(): 
     91                if term[0][0] == 'Q': 
     92                    uri = term[0][1:] 
     93                    break 
     94            assert uri, 'uniQue term (URI) not found in document term list' 
     95            yield Hit(uri, document=self.indexer.fetch, 
     96                      did=hit[xapian.MSET_DID], 
     97                      score=float(hit[xapian.MSET_PERCENT]) / 100.0) 
    8598 
    8699    def __len__(self): 
    87100        return len(self.context) 
    88101 
    89     def __getitem__(self, index): 
    90         doc = self.indexer.idx.get_document(self.context[index]['uid']) 
    91         return Hit(doc.get_value(self.indexer.idx.indexValueMap['uri']), 
    92                    document=self.indexer.fetch) 
     102#    def __getitem__(self, index): 
     103#        doc = self.indexer.idx.get_document(self.context[index]['uid']) 
     104#        return Hit(doc.get_value(self.indexer.idx.indexValueMap['uri']), 
     105#                   document=self.indexer.fetch) 
  • pyndexter/trunk/setup.py

    r346 r354  
    11from setuptools import setup, Extension 
     2from Pyrex.Distutils import build_ext 
    23 
    34setup(name='pyndexter', 
     
    2223      extras_require={'hype': ['hype>=0.1'], 
    2324                      'Xapwrap': ['Xapwrap>=0.3']}, 
     25      ext_modules=[Extension('pyndexter.pyrex', ['pyndexter/pyrex.pyx'])], 
    2426      packages=['pyndexter']) 
  • pyndexter/trunk/.todo

    r332 r354  
    2424        HTTPSource should be able to handle multiple iterations, but self._traversed renders this impossible. 
    2525    </note> 
     26    <note priority="medium" time="1159011350"> 
     27        For storing state, perhaps there should be default store_state(store)/restore_state(store) methods. Also need a Store class, or just use a file object... 
     28    </note> 
    2629</todo>