Changeset 354
- Timestamp:
- 09/25/06 07:49:51 (2 years ago)
- Files:
- pyndexter/trunk/COPYING (added)
- pyndexter/trunk/pyndexter/default.py (modified) (1 diff)
- pyndexter/trunk/pyndexter/file.py (modified) (1 diff)
- pyndexter/trunk/pyndexter/hyperestraier.py (modified) (2 diffs)
- pyndexter/trunk/pyndexter/indexers.py (added)
- pyndexter/trunk/pyndexter/__init__.py (modified) (1 diff)
- pyndexter/trunk/pyndexter/lucene.py (modified) (1 diff)
- pyndexter/trunk/pyndexter/metasource.py (modified) (1 diff)
- pyndexter/trunk/pyndexter/sources.py (added)
- pyndexter/trunk/pyndexter/util.py (modified) (1 diff)
- pyndexter/trunk/pyndexter/xapian.py (modified) (5 diffs)
- pyndexter/trunk/setup.py (modified) (2 diffs)
- pyndexter/trunk/.todo (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/trunk/pyndexter/default.py
r345 r354 1 # -*- coding: utf-8 -*- 2 # 3 # Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 4 # 5 # This software is licensed as described in the file COPYING, which 6 # you should have received as part of this distribution. 7 # 8 1 9 import re 2 10 import os pyndexter/trunk/pyndexter/file.py
r353 r354 1 # -*- coding: utf-8 -*- 2 # 3 # Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 4 # 5 # This software is licensed as described in the file COPYING, which 6 # you should have received as part of this distribution. 7 # 8 1 9 import sys 2 10 import codecs pyndexter/trunk/pyndexter/hyperestraier.py
r345 r354 1 # -*- coding: utf-8 -*- 2 # 3 # Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 4 # 5 # This software is licensed as described in the file COPYING, which 6 # you should have received as part of this distribution. 7 # 8 1 9 import os 2 10 import hype … … 34 42 if isinstance(document, basestring): 35 43 document = self.fetch(document) 36 hdoc = hype.Document( )44 hdoc = hype.Document(document.uri) 37 45 for k, v in document.attributes.iteritems(): 38 hdoc['@' + k] = v 46 if k != 'uri': 47 hdoc['@' + k] = v 39 48 hdoc.add_text(document.content) 40 49 self.db.put_doc(hdoc) pyndexter/trunk/pyndexter/__init__.py
r345 r354 1 # -*- coding: utf-8 -*- 2 # 3 # Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 4 # 5 # This software is licensed as described in the file COPYING, which 6 # you should have received as part of this distribution. 7 # 8 1 9 import os 2 10 import pickle pyndexter/trunk/pyndexter/lucene.py
r322 r354 1 # -*- coding: utf-8 -*- 2 # 3 # Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 4 # 5 # This software is licensed as described in the file COPYING, which 6 # you should have received as part of this distribution. 7 # 8 1 9 import PyLucene 2 10 from pyndexter import * pyndexter/trunk/pyndexter/metasource.py
r345 r354 1 # -*- coding: utf-8 -*- 2 # 3 # Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 4 # 5 # This software is licensed as described in the file COPYING, which 6 # you should have received as part of this distribution. 7 # 8 1 9 from pyndexter import * 2 10 from urlparse import urlsplit pyndexter/trunk/pyndexter/util.py
r345 r354 1 # -*- coding: utf-8 -*- 2 # 3 # Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 4 # 5 # This software is licensed as described in the file COPYING, which 6 # you should have received as part of this distribution. 7 # 8 1 9 import time 2 10 from UserDict import DictMixin pyndexter/trunk/pyndexter/xapian.py
r345 r354 1 # -*- coding: utf-8 -*- 2 # 3 # Copyright (C) 2006 Alec Thomas <alec@swapoff.org> 4 # 5 # This software is licensed as described in the file COPYING, which 6 # you should have received as part of this distribution. 7 # 8 1 9 import os 10 import re 2 11 from pyndexter import * 3 import xapwrap.index as xi 4 import xapwrap.document as xd 5 6 7 # XXX Is a numeric ID the only way to uniquely identify documents in Xapian? 8 # XXX This seems crazy, and prone to error. 9 def uri2id(uri): 10 return abs(hash(uri)) 11 12 13 xd.Document.registerFlattener(long, xd.flattenNumeric) 14 xd.Document.registerFlattener(float, xd.flattenNumeric) 12 xapian = __import__('xapian') 15 13 16 14 … … 20 18 CAP_INTERSECTION 21 19 22 def __init__(self, path, source=None, mode=READWRITE): 20 def __init__(self, path, source=None, mode=READWRITE, stemmer='english', 21 words=r'\w+'): 23 22 Indexer.__init__(self, source, mode, os.path.join(path, 'state.db')) 24 23 self.path = path … … 26 25 self.idx_path = os.path.join(path, 'xapian.db') 27 26 if mode == READWRITE: 28 self.idx = xi.SmartIndex(self.idx_path, True) 27 self.db = xapian.WritableDatabase(self.idx_path, 28 xapian.DB_CREATE_OR_OPEN) 29 29 else: 30 self.idx = xi.SmartReadOnlyIndex(self.idx_path) 30 self.db = xapian.Database(self.idx_path) 31 self.stemmer = xapian.Stem('english') 32 self.words = re.compile(words) 33 31 34 32 35 def index(self, document): … … 35 38 document = self.fetch(document) 36 39 37 sort_fields = [xd.SortKey(u'uri', document.uri)] 38 sort_fields += [xd.SortKey(k, v) for k, v in 39 document.attributes.iteritems() 40 if v is not None and k != 'uri'] 41 doc = xd.Document( 42 textFields=xd.TextField(document.content), 43 sortFields=sort_fields, 44 uid=uri2id(document.uri), 45 source=document.uri) 40 doc = xapian.Document() 46 41 47 self.idx.index(doc) 42 # Xapian doesn't support UTF-8 yet. Coming soon. 
43 content = document.content.encode('utf-8') 44 uri = document.uri.encode('utf-8') 45 46 doc.set_data(content) 47 48 doc.add_term('Q' + uri) 49 50 for word in self.words.finditer(content): 51 term = self.stemmer.stem_word(word.group().lower()) 52 doc.add_posting(term, word.start()) 53 54 self.db.replace_document('Q' + uri, doc) 48 55 49 56 def discard(self, document): … … 51 58 if isinstance(document, Document): 52 59 document = document.uri 53 self. idx.delete_document(uri2id(document))60 self.db.delete_document('Q' + document.encode('utf-8')) 54 61 55 62 def sync(self): 56 63 if self.mode == READWRITE: 57 64 self._assert_rw() 58 self. idx.flush()65 self.db.flush() 59 66 self._sync_source_state() 60 67 61 68 def close(self): 62 69 self.sync() 63 self.idx.close() 64 self.idx = None 70 self.db = None 65 71 66 72 def search(self, phrase, flags=0, order_by=None, order_ascending=True, 67 73 order_type=str): 68 phrase = phrase.encode('utf-8') 69 if order_by == 'relevance': 70 order_args = {'sortByRelevence': True} 71 else: 72 order_args = {'sortKey': order_by} 73 search = self.idx.search(phrase, sortAscending=order_ascending, 74 **order_args) 75 return XapianSearch(self, phrase, search) 74 terms = [self.stemmer.stem_word(term.lower()) 75 for term in self.words.findall(phrase.encode('utf-8'))] 76 enquire = xapian.Enquire(self.db) 77 query = xapian.Query(xapian.Query.OP_AND, terms) 78 enquire.set_query(query) 79 return XapianSearch(self, phrase, enquire) 76 80 77 81 78 82 class XapianSearch(Search): 79 83 def __iter__(self): 80 for hit in self.context: 81 doc = self.indexer.idx.get_document(hit['uid']) 82 # XXX Is this the actual way to get values out?!?!? 
83 yield Hit(doc.get_value(self.indexer.idx.indexValueMap['uri']), 84 document=self.indexer.fetch) 84 matches = self.context.get_mset(0, 10) 85 print matches.get_matches_estimated() 86 for hit in matches: 87 doc = hit[xapian.MSET_DOCUMENT] 88 uri = None 89 # TODO Use skip_to('Q') when implemented (see #26 for more info) 90 for term in doc.termlist(): 91 if term[0][0] == 'Q': 92 uri = term[0][1:] 93 break 94 assert uri, 'uniQue term (URI) not found in document term list' 95 yield Hit(uri, document=self.indexer.fetch, 96 did=hit[xapian.MSET_DID], 97 score=float(hit[xapian.MSET_PERCENT]) / 100.0) 85 98 86 99 def __len__(self): 87 100 return len(self.context) 88 101 89 def __getitem__(self, index):90 doc = self.indexer.idx.get_document(self.context[index]['uid'])91 return Hit(doc.get_value(self.indexer.idx.indexValueMap['uri']),92 document=self.indexer.fetch)102 # def __getitem__(self, index): 103 # doc = self.indexer.idx.get_document(self.context[index]['uid']) 104 # return Hit(doc.get_value(self.indexer.idx.indexValueMap['uri']), 105 # document=self.indexer.fetch) pyndexter/trunk/setup.py
r346 r354 1 1 from setuptools import setup, Extension 2 from Pyrex.Distutils import build_ext 2 3 3 4 setup(name='pyndexter', … … 22 23 extras_require={'hype': ['hype>=0.1'], 23 24 'Xapwrap': ['Xapwrap>=0.3']}, 25 ext_modules=[Extension('pyndexter.pyrex', ['pyndexter/pyrex.pyx'])], 24 26 packages=['pyndexter']) pyndexter/trunk/.todo
r332 r354 24 24 HTTPSource should be able to handle multiple iterations, but self._traversed renders this impossible. 25 25 </note> 26 <note priority="medium" time="1159011350"> 27 For storing state, perhaps there should be default store_state(store)/restore_state(store) methods. Also need a Store class, or just use a file object... 28 </note> 26 29 </todo>
