root/pyndexter/trunk/pyndexter/util.py

Revision 450, 6.2 kB (checked in by athomas, 1 year ago)

pyndexter: moved excerpt into the util module.

Line 
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright (C) 2006 Alec Thomas <alec@swapoff.org>
4 #
5 # This software is licensed as described in the file COPYING, which
6 # you should have received as part of this distribution.
7 #
8
9 import re
10 import posixpath
11 import sys
12 from StringIO import StringIO
13 from urllib import quote, unquote
14 from datetime import datetime, timedelta
# Expose `set` and `frozenset` as module-level names. On interpreters that
# predate the builtin set types the lookups below raise NameError, in which
# case the equivalent classes from the old `sets` module are used instead.
try:
    set = set
    frozenset = frozenset
except NameError:
    # Only a missing builtin should trigger the fallback; a bare `except:`
    # would also hide unrelated errors such as KeyboardInterrupt.
    from sets import Set as set
    from sets import ImmutableSet as frozenset
21
22
# Public names exported by `from ... import *`.
__all__ = [
    'set',
    'frozenset',
    'quote',
    'unquote',
    'URI',
    'excerpt',
]
26
class URI(object):
    """Parse a URI into its component parts. The `query` component is passed
    through `parse_qs()`.

        scheme://username:password@host:port/path?query#fragment

    Each component is available as an attribute of the object. All
    components are strings, except `query`, which is a dictionary mapping
    each key to a list of values.

    TODO: Support "parameters???" Never seen this in the wild:
        scheme://username:password@host/path;parameters?query#fragment

    PS. `urlparse` is not useful.

    The URI constructor can be passed a string:

    >>> u = URI('http://user:password@www.example.com/some/path?parm=1&parm=2&other=3#fragment')
    >>> u
    URI(u'http://user:password@www.example.com/some/path?other=3&parm=1&parm=2#fragment')
    >>> u.scheme
    'http'
    >>> u.username
    'user'
    >>> u.password
    'password'
    >>> u.host
    'www.example.com'
    >>> u.path
    '/some/path'
    >>> u.query
    {'parm': ['1', '2'], 'other': ['3']}
    >>> u.fragment
    'fragment'

    An explicit port is kept as a string:

    >>> p = URI('http://www.example.com:8080/some/path')
    >>> p.port
    '8080'
    >>> p
    URI(u'http://www.example.com:8080/some/path')

    ...or the individual URI components as keyword arguments:

    >>> URI(scheme='http', username='user', password='password', host='www.example.com', path='/some/path', query={'parm': [1, 2], 'other': [3]}, fragment='fragment')
    URI(u'http://user:password@www.example.com/some/path?other=3&parm=1&parm=2#fragment')

    ...or finally, another URI object:

    >>> v = URI(u)
    >>> v == u
    True
    >>> v.query is u.query
    False
    >>> v
    URI(u'http://user:password@www.example.com/some/path?other=3&parm=1&parm=2#fragment')

    URI also normalises the path component:

    >>> URI('http://www.example.com//some/../foo/path/')
    URI(u'http://www.example.com/foo/path')
    """

    # Every component is optional, so this pattern matches any string; the
    # capture groups are simply empty for the parts that are absent.
    _pattern = re.compile(
        r'(?:(?P<scheme>[^:]+)://)?'
        r'(?:(?P<username>[^:@]*)(?::(?P<password>[^@]*))?@)?'
        r'(?P<host>[^?/#:]*)'
        r'(?::(?P<port>\d+))?'
        r'(?P<path>/[^#?]*)?'
        r'(?:\?(?P<query>[^#]*))?'
        r'(?:#(?P<fragment>.*))?')

    __slots__ = ('scheme', 'username', 'password', 'host', 'port', '_path',
                 'query', 'fragment')

    def __init__(self, uri=None, scheme='', username='', password='', host='',
                 port='', path='', query=None, fragment=''):
        """Build a URI from a string, another URI, or keyword components.

        If `uri` is a `URI` its components are copied (the `query`
        dictionary is shallow-copied). If `uri` is any other non-None value
        it is parsed as a URI string. In both cases the keyword components
        are ignored. If `uri` is None the keyword components are used as
        given; `query` defaults to a new empty dictionary.
        """
        self._path = ''
        if isinstance(uri, URI):
            # Copy attributes of a URI object
            from copy import copy
            self.scheme = uri.scheme
            self.username = uri.username
            self.password = uri.password
            self.host = uri.host
            self.port = uri.port
            self.path = uri.path
            self.query = copy(uri.query)
            self.fragment = uri.fragment
        elif uri is not None:
            # Parse URI string. parse_qs has lived in different modules
            # across Python versions, so try each home in turn.
            try:
                from urlparse import parse_qs
            except ImportError:
                try:
                    from urllib.parse import parse_qs
                except ImportError:
                    from cgi import parse_qs

            match = self._pattern.match(uri)
            if match is None:
                raise ValueError('Invalid URI')
            raw = match.groupdict()
            self.scheme = unquote(raw['scheme'] or '')
            self.username = unquote(raw['username'] or '')
            self.password = unquote(raw['password'] or '')
            self.host = unquote(raw['host'] or '')
            self.port = unquote(raw['port'] or '')
            self.path = unquote(raw['path'] or '')
            # parse_qs does its own unquoting, so it gets the raw text.
            self.query = parse_qs(raw['query'] or '')
            self.fragment = unquote(raw['fragment'] or '')
        else:
            # Explicitly provide URI components
            if query is None:
                # A fresh dictionary per instance; a `{}` default in the
                # signature would be shared by every URI built without one.
                query = {}
            self.scheme = scheme
            self.username = username
            self.password = password
            self.host = host
            self.port = port
            self.path = path
            self.query = query
            self.fragment = fragment

    def _set_path(self, path):
        """Store a normalised copy of `path`.

        A non-empty path is run through `posixpath.normpath()` and given
        exactly one leading slash; an empty path is stored as ''.
        """
        if path:
            self._path = '/' + posixpath.normpath(path).lstrip('/')
        else:
            self._path = ''

    def _get_path(self):
        """Return the normalised path."""
        return self._path

    path = property(_get_path, _set_path)

    def __cmp__(self, other):
        """Compare two URI objects by their `repr()`.

        >>> u = URI('http://user:password@www.example.com/some/path?parm=1&parm=2&other=3#fragment')
        >>> v = URI(u)
        >>> u == v
        True
        >>> v.host = 'www.google.com'
        >>> u == v
        False
        """
        mine, theirs = repr(self), repr(other)
        # Same result as cmp(mine, theirs), without needing the builtin.
        return (mine > theirs) - (mine < theirs)

    def __eq__(self, other):
        """Return True if both objects have the same `repr()`."""
        return repr(self) == repr(other)

    def __ne__(self, other):
        """Return True if the objects have different `repr()` values."""
        return not self.__eq__(other)

    def __repr__(self):
        return "URI(u'%s')" % (self.__str__(),)

    def __str__(self):
        """Return the URI as text, with its components URL-quoted.

        Query parameters are emitted sorted by key. Query keys are written
        as-is; only the values are quoted.
        """
        # Joining with a unicode separator yields unicode text on Python 2.
        chunks = []
        if self.scheme:
            chunks.append(quote(self.scheme) + u'://')
        if self.username or self.password:
            if self.username:
                chunks.append(quote(self.username))
            if self.password:
                chunks.append(u':' + quote(self.password))
            chunks.append(u'@')
        chunks.append(quote(self.host))
        if self.port:
            chunks.append(u':%s' % self.port)
        chunks.append(quote(self.path))
        if self.query:
            pairs = [u'%s=%s' % (key, quote(str(value)))
                     for key, values in sorted(self.query.items())
                     for value in values]
            chunks.append(u'?' + u'&'.join(pairs))
        if self.fragment:
            chunks.append(u'#' + quote(self.fragment))
        return u''.join(chunks)
163
164
def excerpt(text, terms, max_len=240, fuzz=60):
    """Generate an excerpt of a Document. Attempts to include as many `terms`
    as possible in the excerpt.

    The excerpt is anchored on the earliest case-insensitive occurrence of
    any of `terms` in `text`; if none occurs, it is anchored on the start of
    the text. Up to `fuzz` characters of leading context are included,
    preferably starting just after a '.', ':', ';' or '=' found in that
    window, followed by `max_len` characters from the anchor onwards.
    '... ' is prepended and ' ...' appended when text was cut off at the
    respective end.
    """
    # FIXME Take into account stemming
    # FIXME Take into account whole-word only search, or
    # wild-card...etc.??? Tricky.
    text_low = text.lower()
    hits = [pos for pos in [text_low.find(term.lower()) for term in terms]
            if pos > -1]
    if hits:
        beg = min(hits)
    else:
        # No term occurs: anchor at the start. Using find()'s -1 here would
        # shorten the slice by one character and could add a false ' ...'.
        beg = 0

    excerpt_beg = 0
    if beg > fuzz:
        # Prefer starting right after a separator inside the context
        # window; separators are tried in the order listed.
        for sep in ('.', ':', ';', '='):
            sep_pos = text.find(sep, beg - fuzz, beg - 1)
            if sep_pos > -1:
                excerpt_beg = sep_pos + 1
                break
        else:
            excerpt_beg = beg - fuzz

    msg = text[excerpt_beg:beg + max_len]
    if beg > fuzz:
        msg = '... ' + msg
    if beg < len(text) - max_len:
        msg = msg + ' ...'
    return msg
Note: See TracBrowser for help on using the browser.