| 1 |
# -*- coding: utf-8 -*- |
|---|
| 2 |
# |
|---|
| 3 |
# Copyright (C) 2006 Alec Thomas <alec@swapoff.org> |
|---|
| 4 |
# |
|---|
| 5 |
# This software is licensed as described in the file COPYING, which |
|---|
| 6 |
# you should have received as part of this distribution. |
|---|
| 7 |
# |
|---|
| 8 |
|
|---|
| 9 |
import re |
|---|
| 10 |
import posixpath |
|---|
| 11 |
import sys |
|---|
| 12 |
from StringIO import StringIO |
|---|
| 13 |
from urllib import quote, unquote |
|---|
| 14 |
from datetime import datetime, timedelta |
|---|
| 15 |
# Compatibility shim: `set` and `frozenset` are builtins on modern Pythons;
# on interpreters that lack them, fall back to the legacy `sets` module.
# Only NameError means "builtin missing" -- anything else must propagate.
try:
    set = set
    frozenset = frozenset
except NameError:
    from sets import Set as set
    from sets import ImmutableSet as frozenset
|---|
| 21 |
|
|---|
| 22 |
|
|---|
| 23 |
# Public names exported by a star-import of this module.
__all__ = ['set', 'frozenset', 'quote', 'unquote', 'URI', 'excerpt']
|---|
| 26 |
|
|---|
| 27 |
class URI(object):
    """Parse a URI into its component parts. The `query` component is passed
    through `parse_qs()`.

        scheme://username:password@host:port/path?query#fragment

    Each component is available as an attribute of the object.

    TODO: Support "parameters???" Never seen this in the wild:
        scheme://username:password@host/path;parameters?query#fragment

    PS. `urlparse` is not useful.

    The URI constructor can be passed a string:

    >>> u = URI('http://user:password@www.example.com/some/path?parm=1&parm=2&other=3#fragment')
    >>> u
    URI(u'http://user:password@www.example.com/some/path?other=3&parm=1&parm=2#fragment')
    >>> u.scheme
    'http'
    >>> u.username
    'user'
    >>> u.password
    'password'
    >>> u.host
    'www.example.com'
    >>> u.path
    '/some/path'
    >>> u.query
    {'parm': ['1', '2'], 'other': ['3']}
    >>> u.fragment
    'fragment'

    An explicit port is parsed into its own component:

    >>> URI('http://www.example.com:8080/index').port
    '8080'

    ...or the individual URI components as keyword arguments:

    >>> URI(scheme='http', username='user', password='password', host='www.example.com', path='/some/path', query={'parm': [1, 2], 'other': [3]}, fragment='fragment')
    URI(u'http://user:password@www.example.com/some/path?other=3&parm=1&parm=2#fragment')

    ...or finally, another URI object:

    >>> v = URI(u)
    >>> v == u
    True
    >>> v.query is u.query
    False
    >>> v
    URI(u'http://user:password@www.example.com/some/path?other=3&parm=1&parm=2#fragment')

    URI also normalises the path component:

    >>> URI('http://www.example.com//some/../foo/path/')
    URI(u'http://www.example.com/foo/path')
    """

    # One optional group per component, in the order __init__ unpacks them:
    # scheme, username, password, host, port, path, query, fragment.
    # The userinfo classes exclude '/', '?' and '#' so that an '@' occurring
    # later in the path or query is not mistaken for the userinfo delimiter.
    _pattern = re.compile(
        r'(?:(?P<scheme>[^:]+)://)?'
        r'(?:(?P<username>[^:@/?#]*)(?::(?P<password>[^@/?#]*))?@)?'
        r'(?P<host>[^?/#:]*)'
        r'(?::(?P<port>\d+))?'
        r'(?P<path>/[^#?]*)?'
        r'(?:\?(?P<query>[^#]*))?'
        r'(?:#(?P<fragment>.*))?')

    __slots__ = ('scheme', 'username', 'password', 'host', 'port', '_path',
                 'query', 'fragment')

    def __init__(self, uri=None, scheme='', username='', password='', host='',
                 port='', path='', query=None, fragment=''):
        """Build a URI from a string, another URI, or explicit components.

        `uri` may be a URI instance (its attributes are copied and the query
        dict is shallow-copied), a string (parsed with `_pattern`), or None,
        in which case the keyword arguments supply the components. `query`
        defaults to a fresh empty dict for every instance.

        Raises ValueError if the string form does not match `_pattern`.
        """
        self._path = ''
        if isinstance(uri, URI):
            # Copy attributes of a URI object
            from copy import copy
            self.scheme = uri.scheme
            self.username = uri.username
            self.password = uri.password
            self.host = uri.host
            self.port = uri.port
            self.path = uri.path
            self.query = copy(uri.query)
            self.fragment = uri.fragment
        elif uri is not None:
            # Parse URI string. parse_qs has lived in several stdlib modules
            # over time; try each known home in turn.
            try:
                from urlparse import parse_qs
            except ImportError:
                try:
                    from urllib.parse import parse_qs
                except ImportError:
                    from cgi import parse_qs

            match = self._pattern.match(uri)
            if match is None:
                raise ValueError('Invalid URI')
            # Unmatched optional groups come back as None; treat as empty.
            groups = [g or '' for g in match.groups()]
            self.scheme = unquote(groups[0])
            self.username = unquote(groups[1])
            self.password = unquote(groups[2])
            self.host = unquote(groups[3])
            self.port = unquote(groups[4])
            self.path = unquote(groups[5])
            self.query = parse_qs(groups[6])
            self.fragment = unquote(groups[7])
        else:
            # Explicitly provide URI components
            if query is None:
                # A new dict per instance: a shared default would leak
                # mutations from one URI into every other one.
                query = {}
            self.scheme = scheme
            self.username = username
            self.password = password
            self.host = host
            self.port = port
            self.path = path
            self.query = query
            self.fragment = fragment

    def _set_path(self, path):
        """Store a normalised form of `path`.

        A non-empty path is passed through `posixpath.normpath` and given
        exactly one leading slash; an empty path is stored as ''.
        """
        if path:
            self._path = '/' + posixpath.normpath(path).lstrip('/')
        else:
            self._path = ''

    def _get_path(self):
        """Return the stored (already normalised) path."""
        return self._path

    path = property(_get_path, _set_path)

    def __cmp__(self, other):
        """Compare two URI objects.

        >>> u = URI('http://user:password@www.example.com/some/path?parm=1&parm=2&other=3#fragment')
        >>> v = URI(u)
        >>> u == v
        True
        >>> v.host = 'www.google.com'
        >>> u == v
        False
        """
        return cmp(repr(self), repr(other))

    def __repr__(self):
        return "URI(u'%s')" % self.__str__()

    def __str__(self):
        """Reassemble the components into a URI string.

        Every component except the port and the query keys is passed through
        `quote`; query keys are emitted in sorted order.
        """
        parts = []
        if self.scheme:
            parts.append(quote(self.scheme) + u'://')
        if self.username or self.password:
            if self.username:
                parts.append(quote(self.username))
            if self.password:
                parts.append(u':' + quote(self.password))
            parts.append(u'@')
        parts.append(quote(self.host))
        if self.port:
            parts.append(u':%s' % self.port)
        parts.append(quote(self.path))
        if self.query:
            pairs = []
            for key, values in sorted(self.query.items()):
                for value in values:
                    pairs.append(u'%s=%s' % (key, quote(str(value))))
            parts.append(u'?' + u'&'.join(pairs))
        if self.fragment:
            parts.append(u'#' + quote(self.fragment))
        return u''.join(parts)
|---|
| 163 |
|
|---|
| 164 |
|
|---|
| 165 |
def excerpt(text, terms, max_len=240, fuzz=60):
    """Generate an excerpt of a Document. Attempts to include as many `terms`
    as possible in the excerpt.

    The excerpt is anchored at the earliest case-insensitive occurrence of
    any of `terms` (or at the start of `text` when none occurs). If that
    anchor lies more than `fuzz` characters in, the excerpt starts just after
    a '.', ':', ';' or '=' found in the `fuzz` characters before the anchor
    (the separators are tried in that order), or exactly `fuzz` characters
    before it when none is present. The excerpt ends `max_len` characters
    after the anchor. '... ' / ' ...' mark the ends where text was cut.
    """
    # FIXME Take into account stemming
    # FIXME Take into account whole-word only search, or
    # wild-card...etc.??? Tricky.
    lowered = text.lower()
    hits = [pos for pos in [lowered.find(term.lower()) for term in terms]
            if pos > -1]
    if hits:
        beg = min(hits)
    else:
        # No term present: anchor at the start. Leaving this at -1 would
        # shorten the slice below by one character.
        beg = 0

    start = 0
    if beg > fuzz:
        window_start = beg - fuzz
        start = window_start
        for sep in ('.', ':', ';', '='):
            found = text.find(sep, window_start, beg - 1)
            if found > -1:
                # Begin right after the separator.
                start = found + 1
                break

    msg = text[start:beg + max_len]
    if beg > fuzz:
        msg = '... ' + msg
    if beg < len(text) - max_len:
        msg = msg + ' ...'
    return msg
|---|