class TimingFilter(object):
    """A Framework filter for collecting timing statistics.

    The filter passes every ``(transition, uri)`` pair from the wrapped
    filter through unchanged and records how long the consumer spends on
    each one, i.e. the time between handing a pair out and being asked for
    the next one.

    Attributes:
        times: list of ``(transition, uri, start, end)`` tuples, one per
            pair handed to the consumer during the most recent run.
        total: `timedelta` sum of ``end - start`` over `times`.
        average: `timedelta` mean of ``end - start`` over `times`.
        progressive: when true, statistics are printed while running.
    """

    def __init__(self, next_filter=None, progressive=False):
        """`next_filter` is the next filter in the chain.

        `progressive` will print statistics while the indexer is running."""
        # A supplied filter shadows the pass-through `next_filter` method
        # below on this instance only.
        if next_filter:
            self.next_filter = next_filter
        self.times = []
        self.total = timedelta()
        self.average = timedelta()
        self.progressive = progressive

    def next_filter(self, framework, context, stream):
        """Default chain link: yield every pair of `stream` unchanged."""
        for transition, uri in stream:
            yield transition, uri

    def __call__(self, framework, context, stream):
        """Yield the pairs produced by `next_filter`, timing each one.

        `total` and `average` are recomputed only once the underlying
        stream has been exhausted; a consumer that stops early leaves them
        at their previous values.
        """
        self.times = []
        for transition, uri in self.next_filter(framework, context, stream):
            start = datetime.now()
            # Control returns here only when the consumer requests the next
            # pair, so `end - start` is the time the consumer spent on it.
            yield transition, uri
            end = datetime.now()
            line = (transition, uri, start, end)
            self.times.append(line)
            if self.progressive:
                self.print_line(*line)

        self.total = sum(
            (end - start for _transition, _uri, start, end in self.times),
            timedelta())
        self.average = timedelta()
        # A zero total (which includes the empty run) keeps a zero average
        # and avoids dividing by len([]) == 0.
        if self.total:
            self.average = self.total / len(self.times)
        if self.progressive:
            self.print_summary()

    def print_line(self, transition, uri, start, end, out=None):
        """Print one timing line for `uri` to `out`.

        `out` defaults to whatever `sys.stdout` is at call time, so output
        follows a redirected stdout. Raises `KeyError` if `transition` is
        not one of pyndexter's MODIFIED, ADDED or REMOVED.
        """
        from pyndexter import MODIFIED, ADDED, REMOVED
        if out is None:
            out = sys.stdout
        mapping = {MODIFIED: 'MODIFIED', ADDED: 'ADDED', REMOVED: 'REMOVED'}
        print >>out, '%s %s (in %s)' % (mapping[transition], uri, end - start)

    def print_summary(self, out=None):
        """Print the document count, total and average times to `out`.

        `out` defaults to whatever `sys.stdout` is at call time.
        """
        if out is None:
            out = sys.stdout
        print >>out
        print >>out, "Indexed %i documents" % len(self.times)
        print >>out, 'Total time to index: %s' % self.total
        print >>out, 'Average time to index: %s' % self.average

    def __str__(self):
        """Return every timing line followed by the summary as one string."""
        from StringIO import StringIO
        out = StringIO()
        for transition, uri, start, end in self.times:
            self.print_line(transition, uri, start, end, out=out)
        self.print_summary(out)
        return out.getvalue()
def excerpt(text, terms, max_len=240, fuzz=60):
    """Generate an excerpt of a Document. Attempts to include as many `terms`
    as possible in the excerpt.

    The excerpt is anchored on the earliest case-insensitive occurrence of
    any of `terms` in `text`; when none of them occurs it is anchored on
    the start of `text`.

    `max_len` is the number of characters kept from the anchor onwards.
    `fuzz` is the maximum number of characters of leading context kept
    before the anchor; within that window the excerpt starts just after the
    first '.', ':', ';' or '=' found (tried in that order), if any.

    Returns the excerpt, prefixed with '... ' when leading text was dropped
    and suffixed with ' ...' when trailing text was dropped.
    """
    # FIXME Take into account stemming
    # FIXME Take into account whole-word only search, or
    #       wild-card...etc.??? Tricky.
    text_low = text.lower()
    beg = -1
    for k in terms:
        i = text_low.find(k.lower())
        if i > -1 and (beg == -1 or i < beg):
            beg = i
    if beg == -1:
        # No term matched: anchor on the start of the text. Leaving -1 here
        # would shorten the slice below to max_len - 1 characters.
        beg = 0

    excerpt_beg = 0
    if beg > fuzz:
        for sep in ('.', ':', ';', '='):
            eb = text.find(sep, beg - fuzz, beg - 1)
            if eb > -1:
                # Start just after the separator.
                eb += 1
                break
        else:
            # No separator in the window: fall back to the full context.
            eb = beg - fuzz
        excerpt_beg = eb

    msg = text[excerpt_beg:beg + max_len]
    if beg > fuzz:
        msg = '... ' + msg
    if beg < len(text) - max_len:
        msg = msg + ' ...'
    return msg