Changeset 378
- Timestamp:
- 02/06/07 20:06:51 (2 years ago)
- Files:
-
- pyndexter/trunk/pyndexter/indexers/builtin.py (modified) (3 diffs)
- pyndexter/trunk/pyndexter/indexers/default.py (deleted)
- pyndexter/trunk/pyndexter/indexers/xapian.py (modified) (1 diff)
- pyndexter/trunk/pyndexter/__init__.py (modified) (5 diffs)
- pyndexter/trunk/pyndexter/util.py (modified) (1 diff)
- pyndexter/trunk/.todo (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/trunk/pyndexter/indexers/builtin.py
r377 r378 55 55 else: 56 56 return set() 57 58 def keys(self): 59 return self.db.keys() 57 60 58 61 … … 135 138 136 139 def discard(self, uri): 137 pass 140 try: 141 del self.attributes[uri] 142 except KeyError: 143 pass 144 145 uri_set = set([uri]) 146 for word in self.uris.get(uri): 147 self.uris.remove(word, uri_set) 148 self.uris.remove(uri) 149 150 replace = index 151 152 def __iter__(self): 153 return iter(self.uris.keys()) 138 154 139 155 def close(self): … … 172 188 attributes['uri'] = uri 173 189 attributes = dict([(k.encode('utf-8'), v) for k, v in attributes.iteritems()]) 174 return Document(**attributes) 190 return Hit(document=self.indexer.framework.fetch, **attributes) pyndexter/trunk/pyndexter/indexers/xapian.py
r377 r378 60 60 self.db.replace_document('Q' + uri, doc) 61 61 62 def update(self, document): 63 return self.index(document) 62 replace = index 64 63 65 64 def discard(self, uri): pyndexter/trunk/pyndexter/__init__.py
r377 r378 521 521 raise NotImplementedError 522 522 523 def update(self, document): 524 """ Update a document in the index. Default is to `discard()` and 523 def replace(self, document): 524 """Replace a document in the index. Default is to `discard()` and 525 525 `index()`.""" 526 526 self.discard(document.uri) … … 723 723 self.discard(uri) 724 724 elif transition == MODIFIED: 725 self. update(uri) 725 self.replace(uri) 726 726 else: 727 727 self.index(uri) … … 735 735 736 736 def index(self, document): 737 """ Index a single document, specified as either a Document object 738 or a URI.""" 737 """Index a single document, specified as either a Document object or a 738 URI.""" 739 739 self._assert_rw() 740 740 if isinstance(document, basestring): … … 744 744 745 745 def discard(self, document): 746 """ Discard the specified document from the index, specified as either 747 a Document object or a URI.""" 746 """Discard the specified document from the index, specified as either a 747 Document object or a URI.""" 748 748 self._assert_rw() 749 749 if isinstance(document, Document): … … 751 751 return self.indexer.discard(document) 752 752 753 def replace(self, document): 754 """Replace document in the index, specified as either a Document object 755 or a URI.""" 756 self._assert_rw() 757 if isinstance(document, basestring): 758 document = self.fetch(document) 759 return self.indexer.replace(document) 753 760 754 761 def search(self, query): pyndexter/trunk/pyndexter/util.py
r377 r378 70 70 return uri 71 71 72 def stem_text(words_re, stemmer, min_word_length=3, max_word_length=64): 73 """Stem all words in a document. 72 def reduce_text(text, words_re, stemmer=lambda w: w, min_word_length=3, 73 max_word_length=64, unique=False): 74 """Compact all words in a block of text. 74 75 75 76 `words_re` is a compiled re object, `stemmer` is a callable returning a 76 stemmed word.""" 77 stemmed word. 78 79 If `unique` is true, return a string of **unordered** words with duplicates 80 removed.""" 77 81 from StringIO import StringIO 78 out = StringIO() 79 for word in words_re.findall(document.content): 80 pass 82 if unique: 83 out = set() 84 def append(word): 85 out.add(word) 86 else: 87 out = [] 88 def append(word): 89 out.append(word) 90 for word in words_re.findall(text): 91 # Cull short and long words 92 if min_word_length > len(word) > max_word_length: 93 continue 94 append(stemmer(word)) 95 return u' '.join(out) 96 pyndexter/trunk/.todo
r377 r378 17 17 <note priority="medium" time="1145854608" done="1146296772"> 18 18 Finish MetaSource 19 </note> 20 <note priority="medium" time="1146296806"> 21 Optimise on disk format for DefaultIndexer. Use URI/word "ids" rather than full word. 22 19 </note> 23 20 <note priority="medium" time="1146321654"> … … 89 86 <note priority="medium" time="1170685227"> 90 87 Default indexer tasks 88 <note priority="medium" time="1146296806"> 89 Optimise on disk format for DefaultIndexer. Use URI/word "ids" rather than full word. 90 </note> 91 91 <note priority="medium" time="1170685251"> 92 92 Abstract storage mechanism so that sqlite, metakit, anydbm, etc. can be used. This would allow for wide use.
