Show
Ignore:
Timestamp:
08/21/07 07:19:30 (1 year ago)
Author:
athomas
Message:

pyndexter: moved excerpt into the util module.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • pyndexter/trunk/pyndexter/util.py

    r391 r450  
    2222 
    2323__all__ = """ 
    24 set frozenset 
    25 quote unquote 
    26 URI TimingFilter 
     24set frozenset quote unquote URI excerpt 
    2725""".split() 
    2826 
     
    165163 
    166164 
    167 class TimingFilter(object): 
    168     """A Framework filter for collecting timing statistics.""" 
    169     def __init__(self, next_filter=None, progressive=False): 
    170         """`next_filter` is the next filter in the chain. 
    171  
    172         `progressive` will print statistics while the indexer is running.""" 
    173         if next_filter: 
    174             self.next_filter = next_filter 
    175         self.times = [] 
    176         self.total = timedelta() 
    177         self.average = timedelta() 
    178         self.progressive = progressive 
    179  
    180     def next_filter(self, framework, context, stream): 
    181         for transition, uri in stream: 
    182             yield transition, uri 
    183  
    184     def __call__(self, framework, context, stream): 
    185         self.times = [] 
    186         for transition, uri in self.next_filter(framework, context, stream): 
    187             start = datetime.now() 
    188             yield transition, uri 
    189             end = datetime.now() 
    190             line = (transition, uri, start, end) 
    191             self.times.append(line) 
    192             if self.progressive: 
    193                 self.print_line(*line) 
    194  
    195         self.total = timedelta() 
    196         self.average = timedelta() 
    197         for transition, uri, start, end in self.times: 
    198             self.total += end - start 
    199         if self.total: 
    200             self.average = self.total / len(self.times) 
    201         if self.progressive: 
    202             self.print_summary() 
    203  
    204     def print_line(self, transition, uri, start, end, out=sys.stdout): 
    205         from pyndexter import MODIFIED, ADDED, REMOVED 
    206         mapping = {MODIFIED: 'MODIFIED', ADDED: 'ADDED', REMOVED: 'REMOVED'} 
    207         print >>out, '%s %s (in %s)' % (mapping[transition], uri, end - start) 
    208  
    209     def print_summary(self, out=sys.stdout): 
    210         print >>out 
    211         print >>out, "Indexed %i documents" % len(self.times) 
    212         print >>out, 'Total time to index: %s' % self.total 
    213         print >>out, 'Average time to index: %s' % self.average 
    214  
    215     def __str__(self): 
    216         from StringIO import StringIO 
    217         out = StringIO() 
    218         for transition, uri, start, end in self.times: 
    219             self.print_line(transition, uri, start, end, out=out) 
    220         self.print_summary(out) 
    221         return out.getvalue() 
     165def excerpt(text, terms, max_len=240, fuzz=60): 
     166    """Generate an excerpt of a Document. Attempts to include as many `terms` 
     167    as possible in the excerpt. 
     168    """ 
     169    # FIXME Take into account stemming 
     170    # FIXME Take into account whole-word only search, or 
     171    # wild-card...etc.??? Tricky. 
     172    text_low = text.lower() 
     173    beg = -1 
     174    for k in terms: 
     175        i = text_low.find(k.lower()) 
     176        if (i > -1 and i < beg) or beg == -1: 
     177            beg = i 
     178    excerpt_beg = 0 
     179    if beg > fuzz: 
     180        for sep in ('.', ':', ';', '='): 
     181            eb = text.find(sep, beg - fuzz, beg - 1) 
     182            if eb > -1: 
     183                eb += 1 
     184                break 
     185        else: 
     186            eb = beg - fuzz 
     187        excerpt_beg = eb 
     188    if excerpt_beg < 0: 
     189        excerpt_beg = 0 
     190    msg = text[excerpt_beg:beg+max_len] 
     191    if beg > fuzz: 
     192        msg = '... ' + msg 
     193    if beg < len(text)-max_len: 
     194        msg = msg + ' ...' 
     195    return msg