Changeset 450
- Timestamp: 08/21/07 07:19:30 (1 year ago)
- Files:
  - pyndexter/trunk/pyndexter/indexers/builtin.py (modified) (2 diffs)
  - pyndexter/trunk/pyndexter/__init__.py (modified) (5 diffs)
  - pyndexter/trunk/pyndexter/util.py (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/trunk/pyndexter/indexers/builtin.py
r449 r450
 175  175   # Do a low-compression gzip
 176  176   buffer = StringIO()
 177      - gz = GzipFile(fileobj=buffer, compresslevel=1, mode='wb')
 178      - gz.write(document.content.encode('utf-8'))
 179      - gz.close()
      177 + try:
      178 + gz = GzipFile(fileobj=buffer, compresslevel=1, mode='wb')
      179 + gz.write(document.content.encode('utf-8', 'ignore'))
      180 + finally:
      181 + gz.close()
 180  182   self.cachedb[uri] = buffer.getvalue()
 181  183   else:
   …    …
 227  229   if self.compact:
 228  230   gz = GzipFile(fileobj=StringIO(content), mode='rb')
 229      - content = gz.read().decode('utf-8')
      231 + content = gz.read().decode('utf-8', 'ignore')
 230  232   quality = 0.99
 231  233   else:
pyndexter/trunk/pyndexter/__init__.py
r449 r450
  67   67
  68   68   Query Framework Document Indexer Result Hit PluginFactory URI
  69      - Excerpt
  70   69   """.split()
  71   70
   …    …
 139  138   __slots__ = ('attributes', '_content', 'source', 'quality')
 140  139
 141      - def __init__(self, uri, content=None, source=None, changed=None,
 142      - quality=1.0, **attributes):
      140 + def __init__(self, uri, content=None, source=None, quality=1.0,
      141 + **attributes):
      142 + assert isinstance(content, unicode)
 143  143   self._content = content
 144  144   self.source = source
 145  145   self.quality = quality
 146  146   self.attributes = attributes
 147      - self.attributes.update({'uri': URI(uri), 'changed': changed})
      147 + self.attributes.update({'uri': URI(uri)})
 148  148
 149  149   def __repr__(self):
   …    …
 827  827   """ Wrapper around a search hit. If `current` is a callable, it should
 828  828   be a function that fetches the Document associated with `uri`, which is
 829      - passed as the only argument. """
      829 + passed as the only argument.
      830 + """
 830  831
 831  832   __slots__ = ('attributes', '_current', '_indexed')
   …    …
 844  845   return self.attributes.get(key, default)
 845  846
 846      - def excerpt(self, terms, max_len=240, fuzz=60):
 847      - """Generate an Excerpt from this Hit."""
      847 + def get_document(self):
      848 + """Fetch the `active` document, preferring to fetch a fresh document
      849 + from the source, but falling back on the indexed version.
      850 + """
 848  851   try:
 849      - current = True
 850      - doc = self.current
      852 + return self.current
 851  853   except:
 852      - current = False
 853      - doc = self.indexed
 854      - return Excerpt(doc, terms, max_len, fuzz, current)
      854 + return self.indexed
      855 + document = property(get_document)
 855  856
 856  857   def __getattr__(self, key):
   …    …
 882  883   return self._indexed
 883  884   indexed = property(_get_indexed)
 884      -
 885      -
 886      - class Excerpt(object):
 887      - """Generate an excerpt of a Document.
 888      -
 889      - Has three useful attributes:
 890      -
 891      - ``current``
 892      - Whether this is a current copy of the `Document` (as opposed to a
 893      - historical version from the `Indexer`)
 894      -
 895      - ``quality``
 896      - Quality of the text compared to the original, between 0.0 and 1.0.
 897      -
 898      - ``text``
 899      - The excerpt text.
 900      -
 901      - """
 902      - def __init__(self, doc, terms, max_len=240, fuzz=60, current=True):
 903      - self.text = self._shorten(doc.content, terms, max_len, fuzz)
 904      - self.quality = doc.quality
 905      - self.current = current
 906      -
 907      - def _shorten(self, text, terms, max_len=240, fuzz=60):
 908      - # FIXME Take into account stemming
 909      - # FIXME Take into account whole-word only search, or
 910      - # wild-card...etc.??? Tricky.
 911      - text_low = text.lower()
 912      - beg = -1
 913      - for k in terms:
 914      - i = text_low.find(k.lower())
 915      - if (i > -1 and i < beg) or beg == -1:
 916      - beg = i
 917      - excerpt_beg = 0
 918      - if beg > fuzz:
 919      - for sep in ('.', ':', ';', '='):
 920      - eb = text.find(sep, beg - fuzz, beg - 1)
 921      - if eb > -1:
 922      - eb += 1
 923      - break
 924      - else:
 925      - eb = beg - fuzz
 926      - excerpt_beg = eb
 927      - if excerpt_beg < 0:
 928      - excerpt_beg = 0
 929      - msg = text[excerpt_beg:beg+max_len]
 930      - if beg > fuzz:
 931      - msg = '... ' + msg
 932      - if beg < len(text)-max_len:
 933      - msg = msg + ' ...'
 934      - return msg
 935      -
 936      - def __repr__(self):
 937      - return self.text
pyndexter/trunk/pyndexter/util.py
r391 r450
  22   22
  23   23   __all__ = """
  24      - set frozenset
  25      - quote unquote
  26      - URI TimingFilter
       24 + set frozenset quote unquote URI excerpt
  27   25   """.split()
  28   26
   …    …
 165  163
 166  164
 167      - class TimingFilter(object):
 168      - """A Framework filter for collecting timing statistics."""
 169      - def __init__(self, next_filter=None, progressive=False):
 170      - """`next_filter` is the next filter in the chain.
 171      -
 172      - `progressive` will print statistics while the indexer is running."""
 173      - if next_filter:
 174      - self.next_filter = next_filter
 175      - self.times = []
 176      - self.total = timedelta()
 177      - self.average = timedelta()
 178      - self.progressive = progressive
 179      -
 180      - def next_filter(self, framework, context, stream):
 181      - for transition, uri in stream:
 182      - yield transition, uri
 183      -
 184      - def __call__(self, framework, context, stream):
 185      - self.times = []
 186      - for transition, uri in self.next_filter(framework, context, stream):
 187      - start = datetime.now()
 188      - yield transition, uri
 189      - end = datetime.now()
 190      - line = (transition, uri, start, end)
 191      - self.times.append(line)
 192      - if self.progressive:
 193      - self.print_line(*line)
 194      -
 195      - self.total = timedelta()
 196      - self.average = timedelta()
 197      - for transition, uri, start, end in self.times:
 198      - self.total += end - start
 199      - if self.total:
 200      - self.average = self.total / len(self.times)
 201      - if self.progressive:
 202      - self.print_summary()
 203      -
 204      - def print_line(self, transition, uri, start, end, out=sys.stdout):
 205      - from pyndexter import MODIFIED, ADDED, REMOVED
 206      - mapping = {MODIFIED: 'MODIFIED', ADDED: 'ADDED', REMOVED: 'REMOVED'}
 207      - print >>out, '%s %s (in %s)' % (mapping[transition], uri, end - start)
 208      -
 209      - def print_summary(self, out=sys.stdout):
 210      - print >>out
 211      - print >>out, "Indexed %i documents" % len(self.times)
 212      - print >>out, 'Total time to index: %s' % self.total
 213      - print >>out, 'Average time to index: %s' % self.average
 214      -
 215      - def __str__(self):
 216      - from StringIO import StringIO
 217      - out = StringIO()
 218      - for transition, uri, start, end in self.times:
 219      - self.print_line(transition, uri, start, end, out=out)
 220      - self.print_summary(out)
 221      - return out.getvalue()
      165 + def excerpt(text, terms, max_len=240, fuzz=60):
      166 + """Generate an excerpt of a Document. Attempts to include as many `terms`
      167 + as possible in the excerpt.
      168 + """
      169 + # FIXME Take into account stemming
      170 + # FIXME Take into account whole-word only search, or
      171 + # wild-card...etc.??? Tricky.
      172 + text_low = text.lower()
      173 + beg = -1
      174 + for k in terms:
      175 + i = text_low.find(k.lower())
      176 + if (i > -1 and i < beg) or beg == -1:
      177 + beg = i
      178 + excerpt_beg = 0
      179 + if beg > fuzz:
      180 + for sep in ('.', ':', ';', '='):
      181 + eb = text.find(sep, beg - fuzz, beg - 1)
      182 + if eb > -1:
      183 + eb += 1
      184 + break
      185 + else:
      186 + eb = beg - fuzz
      187 + excerpt_beg = eb
      188 + if excerpt_beg < 0:
      189 + excerpt_beg = 0
      190 + msg = text[excerpt_beg:beg+max_len]
      191 + if beg > fuzz:
      192 + msg = '... ' + msg
      193 + if beg < len(text)-max_len:
      194 + msg = msg + ' ...'
      195 + return msg
