root/pyndexter/trunk/pyndexter/util.py

Revision 450, 6.2 KB (checked in by athomas, 3 years ago)

pyndexter: moved excerpt into the util module.

Line 
# -*- coding: utf-8 -*-
#
# Copyright (C) 2006 Alec Thomas <alec@swapoff.org>
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution.
#
8
import posixpath
import re
import sys
from datetime import datetime, timedelta
from StringIO import StringIO
from urllib import quote, unquote
# Compatibility shim: re-export the builtin set types when they exist,
# otherwise fall back to the legacy `sets` module.
try:
    set = set
    frozenset = frozenset
except NameError:
    # Only a missing builtin should trigger the fallback; a bare `except:`
    # would also hide unrelated failures such as KeyboardInterrupt.
    from sets import Set as set
    from sets import ImmutableSet as frozenset
21
22
# Public names exported by a star-import of this module.
__all__ = """
set frozenset quote unquote URI excerpt
""".split()
26
class URI(object):
    """Parse a URI into its component parts. The `query` component is passed
    through `parse_qs()`.

        scheme://username:password@host:port/path?query#fragment

    Each component is available as an attribute of the object.

    TODO: Support "parameters???" Never seen this in the wild:
        scheme://username:password@host/path;parameters?query#fragment

    PS. `urlparse` is not useful.

    The URI constructor can be passed a string:

    >>> u = URI('http://user:password@www.example.com/some/path?parm=1&parm=2&other=3#fragment')
    >>> u
    URI(u'http://user:password@www.example.com/some/path?other=3&parm=1&parm=2#fragment')
    >>> u.scheme
    'http'
    >>> u.username
    'user'
    >>> u.password
    'password'
    >>> u.host
    'www.example.com'
    >>> u.path
    '/some/path'
    >>> u.query
    {'parm': ['1', '2'], 'other': ['3']}
    >>> u.fragment
    'fragment'

    An explicit port is kept as a string:

    >>> URI('http://www.example.com:8080/index.html').port
    '8080'

    ...or the individual URI components as keyword arguments:

    >>> URI(scheme='http', username='user', password='password', host='www.example.com', path='/some/path', query={'parm': [1, 2], 'other': [3]}, fragment='fragment')
    URI(u'http://user:password@www.example.com/some/path?other=3&parm=1&parm=2#fragment')

    ...or finally, another URI object:

    >>> v = URI(u)
    >>> v == u
    True
    >>> v.query is u.query
    False
    >>> v
    URI(u'http://user:password@www.example.com/some/path?other=3&parm=1&parm=2#fragment')

    URI also normalises the path component:

    >>> URI('http://www.example.com//some/../foo/path/')
    URI(u'http://www.example.com/foo/path')
    """

    # One optional named group per component. The scheme and userinfo groups
    # stop at '/', '?' and '#' so that an '@' or '://' appearing later in the
    # path or query is not mistaken for part of the authority.
    # NOTE: the pattern is not anchored at the end, so text that does not fit
    # the grammar is silently left unparsed rather than rejected.
    _pattern = re.compile(
        r'(?:(?P<scheme>[^:/?#]+)://)?'
        r'(?:(?P<username>[^:@/?#]*)(?::(?P<password>[^@/?#]*))?@)?'
        r'(?P<host>[^?/#:]*)'
        r'(?::(?P<port>\d*))?'
        r'(?P<path>/[^#?]*)?'
        r'(?:\?(?P<query>[^#]*))?'
        r'(?:#(?P<fragment>.*))?')

    __slots__ = ('scheme', 'username', 'password', 'host', 'port', '_path',
                 'query', 'fragment')

    def __init__(self, uri=None, scheme='', username='', password='', host='',
                 port='', path='', query=None, fragment=''):
        """Build a URI from a string, another URI, or explicit components.

        `uri` may be a URI instance (its attributes are copied, the query
        dictionary shallowly), a string (parsed with `_pattern`), or None, in
        which case the keyword arguments supply the components. `query`
        defaults to a fresh empty dictionary for every instance.

        Raises ValueError if the pattern does not match the string at all.
        """
        self._path = ''
        if isinstance(uri, URI):
            # Copy attributes of a URI object
            from copy import copy
            self.scheme = uri.scheme
            self.username = uri.username
            self.password = uri.password
            self.host = uri.host
            self.port = uri.port
            self.path = uri.path
            self.query = copy(uri.query)
            self.fragment = uri.fragment
        elif uri is not None:
            # Parse URI string. `cgi.parse_qs` is deprecated, so prefer the
            # newer homes of the function and only fall back to it last.
            try:
                from urlparse import parse_qs
            except ImportError:
                try:
                    from urllib.parse import parse_qs
                except ImportError:
                    from cgi import parse_qs

            match = self._pattern.match(uri)
            if match is None:
                raise ValueError('Invalid URI')
            # Unmatched optional groups come back as '' instead of None.
            parts = match.groupdict('')
            self.scheme = unquote(parts['scheme'])
            self.username = unquote(parts['username'])
            self.password = unquote(parts['password'])
            self.host = unquote(parts['host'])
            self.port = unquote(parts['port'])
            self.path = unquote(parts['path'])
            self.query = parse_qs(parts['query'])
            self.fragment = unquote(parts['fragment'])
        else:
            # Explicitly provide URI components
            if query is None:
                # A `{}` default would be shared by every instance.
                query = {}
            self.scheme = scheme
            self.username = username
            self.password = password
            self.host = host
            self.port = port
            self.path = path
            self.query = query
            self.fragment = fragment

    def _set_path(self, path):
        """Store a normalised, '/'-rooted form of `path` ('' stays '')."""
        if path:
            self._path = '/' + posixpath.normpath(path).lstrip('/')
        else:
            self._path = ''

    def _get_path(self):
        """Return the normalised path."""
        return self._path

    path = property(_get_path, _set_path)

    def __cmp__(self, other):
        """Compare two URI objects.

        >>> u = URI('http://user:password@www.example.com/some/path?parm=1&parm=2&other=3#fragment')
        >>> v = URI(u)
        >>> u == v
        True
        >>> v.host = 'www.google.com'
        >>> u == v
        False
        """
        return cmp(repr(self), repr(other))

    def __repr__(self):
        return "URI(u'%s')" % self.__str__()

    def __str__(self):
        """Return the URI as text, percent-quoting each component.

        Query parameters are emitted sorted by key. Only query values are
        quoted; keys are written as-is.
        """
        uri = u''
        if self.scheme:
            uri += quote(self.scheme) + u'://'
        if self.username or self.password:
            if self.username:
                uri += quote(self.username)
            if self.password:
                uri += u':' + quote(self.password)
            uri += u'@'
        uri += quote(self.host)
        if self.port:
            uri += u':%s' % self.port
        uri += quote(self.path)
        if self.query:
            pairs = []
            for key, values in sorted(self.query.items()):
                for value in values:
                    pairs.append(u'%s=%s' % (key, quote(str(value))))
            uri += u'?' + u'&'.join(pairs)
        if self.fragment:
            uri += u'#' + quote(self.fragment)
        return uri
163
164
def excerpt(text, terms, max_len=240, fuzz=60):
    """Generate an excerpt of a Document. Attempts to include as many `terms`
    as possible in the excerpt.

    The excerpt is anchored on the earliest case-insensitive occurrence of
    any term. Up to `fuzz` characters of leading context are kept, starting
    just after a '.', ':', ';' or '=' found in that window when there is one,
    followed by `max_len` characters counted from the match. If no term
    occurs in `text`, or `terms` is empty, the excerpt starts at the
    beginning of the text.

    Returns the excerpt string, prefixed with '... ' and/or suffixed with
    ' ...' where text was cut off.
    """
    # FIXME Take into account stemming
    # FIXME Take into account whole-word only search, or
    # wild-card...etc.??? Tricky.
    text_low = text.lower()
    hits = [text_low.find(term.lower()) for term in terms]
    hits = [pos for pos in hits if pos > -1]
    if hits:
        beg = min(hits)
    else:
        # No match: anchor at 0 rather than -1, which would make the slice
        # below one character short and add an ellipsis to untruncated text.
        beg = 0

    excerpt_beg = 0
    if beg > fuzz:
        # Prefer to start right after a separator inside the context window.
        for sep in ('.', ':', ';', '='):
            sep_pos = text.find(sep, beg - fuzz, beg - 1)
            if sep_pos > -1:
                excerpt_beg = sep_pos + 1
                break
        else:
            excerpt_beg = beg - fuzz

    msg = text[excerpt_beg:beg + max_len]
    if beg > fuzz:
        msg = '... ' + msg
    if beg < len(text) - max_len:
        msg = msg + ' ...'
    return msg
Note: See TracBrowser for help on using the browser.