| 1 | # -*- coding: utf-8 -*- |
|---|
| 2 | # |
|---|
| 3 | # Copyright (C) 2006 Alec Thomas <alec@swapoff.org> |
|---|
| 4 | # |
|---|
| 5 | # This software is licensed as described in the file COPYING, which |
|---|
| 6 | # you should have received as part of this distribution. |
|---|
| 7 | # |
|---|
| 8 | |
|---|
| 9 | import re |
|---|
| 10 | import posixpath |
|---|
| 11 | import sys |
|---|
| 12 | from StringIO import StringIO |
|---|
| 13 | from urllib import quote, unquote |
|---|
| 14 | from datetime import datetime, timedelta |
|---|
# Compatibility shim: expose ``set`` and ``frozenset`` as module-level names.
# On interpreters that lack the built-in types, looking them up raises
# NameError and the equivalents from the legacy ``sets`` module are used.
try:
    set = set
    frozenset = frozenset
except NameError:
    from sets import Set as set
    from sets import ImmutableSet as frozenset
|---|
| 21 | |
|---|
| 22 | |
|---|
# Public names exported by a star-import of this module.
__all__ = [
    'set',
    'frozenset',
    'quote',
    'unquote',
    'URI',
    'excerpt',
]
|---|
| 26 | |
|---|
class URI(object):
    """Parse a URI into its component parts. The `query` component is passed
    through `parse_qs()`, so it is a dictionary mapping each key to a list
    of values.

    scheme://username:password@host:port/path?query#fragment

    Each component is available as an attribute of the object.

    TODO: Support "parameters???" Never seen this in the wild:
    scheme://username:password@host/path;parameters?query#fragment

    PS. `urlparse` is not useful.

    The URI constructor can be passed a string:

    >>> u = URI('http://user:password@www.example.com/some/path?parm=1&parm=2&other=3#fragment')
    >>> u
    URI(u'http://user:password@www.example.com/some/path?other=3&parm=1&parm=2#fragment')
    >>> u.scheme
    'http'
    >>> u.username
    'user'
    >>> u.password
    'password'
    >>> u.host
    'www.example.com'
    >>> u.path
    '/some/path'
    >>> u.query
    {'parm': ['1', '2'], 'other': ['3']}
    >>> u.fragment
    'fragment'

    An explicit port is parsed into the `port` attribute (as a string):

    >>> URI('http://www.example.com:8080/index').port
    '8080'

    ...or the individual URI components as keyword arguments:

    >>> URI(scheme='http', username='user', password='password', host='www.example.com', path='/some/path', query={'parm': [1, 2], 'other': [3]}, fragment='fragment')
    URI(u'http://user:password@www.example.com/some/path?other=3&parm=1&parm=2#fragment')

    ...or finally, another URI object:

    >>> v = URI(u)
    >>> v == u
    True
    >>> v.query is u.query
    False
    >>> v
    URI(u'http://user:password@www.example.com/some/path?other=3&parm=1&parm=2#fragment')

    URI also normalises the path component:

    >>> URI('http://www.example.com//some/../foo/path/')
    URI(u'http://www.example.com/foo/path')
    """

    # Every component is optional, so this pattern matches a (possibly
    # empty) prefix of any string; anything it cannot place is ignored.
    _pattern = re.compile(
        r'(?:(?P<scheme>[^:]+)://)?'
        r'(?:(?P<username>[^:@]*)(?::(?P<password>[^@]*))?@)?'
        r'(?P<host>[^?/#:]*)'
        r'(?::(?P<port>\d+))?'
        r'(?P<path>/[^#?]*)?'
        r'(?:\?(?P<query>[^#]*))?'
        r'(?:#(?P<fragment>.*))?')

    # `path` is exposed through a property, so only `_path` needs a slot.
    __slots__ = ('scheme', 'username', 'password', 'host', 'port', '_path',
                 'query', 'fragment')

    def __init__(self, uri=None, scheme='', username='', password='', host='',
                 port='', path='', query=None, fragment=''):
        """Build a URI from a string, another URI, or keyword components.

        `uri` may be a URI instance (its attributes are copied, with a
        shallow copy of `query`) or a string to parse. When `uri` is given
        the keyword components are ignored; otherwise they are used as-is,
        with `query` defaulting to a fresh empty dictionary.

        Raises ValueError if the URI pattern does not match at all.
        """
        self._path = ''
        if isinstance(uri, URI):
            # Copy attributes of a URI object
            from copy import copy
            self.scheme = uri.scheme
            self.username = uri.username
            self.password = uri.password
            self.host = uri.host
            self.port = uri.port
            self.path = uri.path
            self.query = copy(uri.query)
            self.fragment = uri.fragment
        elif uri is not None:
            # Parse URI string. `cgi.parse_qs` is deprecated, so prefer the
            # newer locations and only fall back to it on old interpreters.
            try:
                from urlparse import parse_qs
            except ImportError:
                try:
                    from urllib.parse import parse_qs
                except ImportError:
                    from cgi import parse_qs

            match = self._pattern.match(uri)
            if match is None:
                raise ValueError('Invalid URI')
            # Unmatched optional groups come back as None; normalise to ''
            parts = dict([(name, value or '')
                          for name, value in match.groupdict().items()])
            self.scheme = unquote(parts['scheme'])
            self.username = unquote(parts['username'])
            self.password = unquote(parts['password'])
            self.host = unquote(parts['host'])
            self.port = unquote(parts['port'])
            self.path = unquote(parts['path'])
            # parse_qs does its own unquoting of keys and values
            self.query = parse_qs(parts['query'])
            self.fragment = unquote(parts['fragment'])
        else:
            # Explicitly provide URI components
            if query is None:
                # A new dict per instance; a `{}` default would be shared
                query = {}
            self.scheme = scheme
            self.username = username
            self.password = password
            self.host = host
            self.port = port
            self.path = path
            self.query = query
            self.fragment = fragment

    def _set_path(self, path):
        """Store a normalised copy of `path` with exactly one leading '/'.

        An empty path is stored as the empty string.
        """
        if path:
            self._path = '/' + posixpath.normpath(path).lstrip('/')
        else:
            self._path = ''

    def _get_path(self):
        """Return the normalised path."""
        return self._path

    path = property(_get_path, _set_path)

    def __cmp__(self, other):
        """Compare two URI objects by their `repr()`.

        >>> u = URI('http://user:password@www.example.com/some/path?parm=1&parm=2&other=3#fragment')
        >>> v = URI(u)
        >>> u == v
        True
        >>> v.host = 'www.google.com'
        >>> u == v
        False
        """
        mine, theirs = repr(self), repr(other)
        return (mine > theirs) - (mine < theirs)

    def __eq__(self, other):
        """Return True if both objects have the same `repr()`."""
        return repr(self) == repr(other)

    def __ne__(self, other):
        """Return True if the objects have different `repr()` values."""
        return not self.__eq__(other)

    def __repr__(self):
        return "URI(u'%s')" % self.__str__()

    def __str__(self):
        """Reassemble the URI, percent-quoting each component.

        Query keys are emitted in sorted order so that equal URIs always
        have the same textual form.
        """
        uri = u''
        if self.scheme:
            uri += quote(self.scheme) + u'://'
        if self.username or self.password:
            if self.username:
                uri += quote(self.username)
            if self.password:
                uri += u':' + quote(self.password)
            uri += u'@'
        uri += quote(self.host)
        if self.port:
            uri += u':%s' % self.port
        uri += quote(self.path)
        if self.query:
            pairs = []
            for key, values in sorted(self.query.items()):
                for value in values:
                    # Quote the key too, so keys containing '&' or '='
                    # survive a parse/format round trip.
                    pairs.append(u'%s=%s' % (quote(str(key)),
                                             quote(str(value))))
            uri += u'?' + u'&'.join(pairs)
        if self.fragment:
            uri += u'#' + quote(self.fragment)
        return uri
|---|
| 163 | |
|---|
| 164 | |
|---|
def excerpt(text, terms, max_len=240, fuzz=60):
    """Generate an excerpt of a Document. Attempts to include as many `terms`
    as possible in the excerpt.

    The excerpt is anchored at the earliest case-insensitive occurrence of
    any of `terms` (or at the start of `text` when none occurs, or `terms`
    is empty). Up to `fuzz` characters of leading context are kept,
    preferably starting just after a '.', ':', ';' or '=' found in that
    window, and the excerpt ends `max_len` characters after the anchor.
    '... ' is prepended when leading text was cut and ' ...' is appended
    when trailing text was cut.

    Returns the excerpt as a string.
    """
    # FIXME Take into account stemming
    # FIXME Take into account whole-word only search, or
    # wild-card...etc.??? Tricky.
    text_low = text.lower()
    hits = [pos for pos in [text_low.find(term.lower()) for term in terms]
            if pos > -1]
    # Anchor on the earliest hit. With no hit, anchor at the start of the
    # text rather than at -1, which would shorten the slice by one
    # character and skew the truncation test below.
    beg = 0
    if hits:
        beg = min(hits)
    excerpt_beg = 0
    if beg > fuzz:
        # Separators are tried in priority order; the first one present in
        # the leading window decides where the excerpt starts.
        for sep in ('.', ':', ';', '='):
            eb = text.find(sep, beg - fuzz, beg - 1)
            if eb > -1:
                excerpt_beg = eb + 1
                break
        else:
            excerpt_beg = beg - fuzz
        if excerpt_beg < 0:
            excerpt_beg = 0
    msg = text[excerpt_beg:beg + max_len]
    if beg > fuzz:
        msg = '... ' + msg
    if beg < len(text) - max_len:
        msg = msg + ' ...'
    return msg
|---|