Changeset 379
- Timestamp:
- 02/08/07 00:20:45 (2 years ago)
- Files:
-
- pyndexter/trunk/pyndexter/indexers/builtin.py (modified) (8 diffs)
- pyndexter/trunk/pyndexter/indexers/xapian.py (modified) (4 diffs)
- pyndexter/trunk/pyndexter/__init__.py (modified) (10 diffs)
- pyndexter/trunk/pyndexter/sources/file.py (modified) (4 diffs)
- pyndexter/trunk/pyndexter/util.py (modified) (4 diffs)
- pyndexter/trunk/.todo (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/trunk/pyndexter/indexers/builtin.py
r378 r379 23 23 24 24 class KeyedSet(object): 25 def __init__(self, file, mode='c'):26 self.db = anydbm.open(file, mode)25 def __init__(self, db): 26 self.db = db 27 27 28 28 def update(self, key, values): 29 29 key = key.encode('utf-8') 30 if key in self.db:30 try: 31 31 v = pickle.loads(self.db[key]) 32 e lse:32 except KeyError: 33 33 v = set() 34 34 v.update(values) … … 37 37 def remove(self, key, values=None): 38 38 key = key.encode('utf-8') 39 if key in self.db:40 if values is None:39 if values is None: 40 try: 41 41 del self.db[key] 42 else: 42 except KeyError: 43 pass 44 else: 45 try: 43 46 v = pickle.loads(self.db[key]) 44 47 v.remove(values) 45 48 self.db[key] = pickle.dumps(v, 2) 49 except KeyError: 50 pass 46 51 47 52 def replace(self, key, values): … … 51 56 def get(self, key): 52 57 key = key.encode('utf-8') 53 if key in self.db:58 try: 54 59 return pickle.loads(self.db[key]) 55 e lse:60 except KeyError: 56 61 return set() 57 62 … … 62 67 class PickleDict(DictMixin): 63 68 """A dictionary wrapper that automatically pickles keys and values.""" 64 def __init__(self, file, mode='c'):65 self.db = anydbm.open(file, mode)69 def __init__(self, db): 70 self.db = db 66 71 67 72 def __getitem__(self, key): … … 81 86 """Constructor URI is: 82 87 83 builtin://<path>/?words=<regex>& max_word_length=<int>&backend=<uri>88 builtin://<path>/?words=<regex>&dbm=<dbm> 84 89 85 90 eg. 86 91 87 builtin:///tmp/builtin.idx?backend=mysql://localhost/database 92 builtin:///tmp/builtin.idx?dbm=gdbm 93 94 Supported dbm's are `anydbm`, `dbhash`, `gdbm` and `dbm` (Python 2.5). 95 `anydbm` is the default. 
88 96 89 97 """ 90 def __init__(self, framework, path, words=r'\w+', max_word_length=32):98 def __init__(self, framework, path, dbm='anydbm'): 91 99 Indexer.__init__(self, framework) 92 93 self.words_re = re.compile(words, re.UNICODE)94 self.max_word_length = max_word_length95 100 96 101 self.path = path 97 102 self.state_path = os.path.join(path, 'store.db') 98 103 self.db_path = os.path.join(path, 'builtin.db') 104 105 framework.reduce.split = True 106 framework.reduce.unique = True 107 108 dbm = __import__(dbm, {}, {}, ['']) 99 109 100 110 if framework.mode == READWRITE: … … 106 116 107 117 # word:set(uri) 108 self.words = KeyedSet( os.path.join(self.db_path, 'words'), mode)118 self.words = KeyedSet(dbm.open(os.path.join(self.db_path, 'words'), mode)) 109 119 # uri:set(word) 110 self.uris = KeyedSet( os.path.join(self.db_path, 'uris'), mode)120 self.uris = KeyedSet(dbm.open(os.path.join(self.db_path, 'uris'), mode)) 111 121 # attribute:dict(attributes) 112 self.attributes = PickleDict(os.path.join(self.db_path, 'attributes'), 113 mode) 114 122 self.attributes = PickleDict(dbm.open(os.path.join(self.db_path, 123 'attributes'), mode)) 115 124 116 125 def index(self, document): 117 126 self.attributes[document.uri] = document.attributes 118 127 119 words = set([self.framework.stemmer(w.lower()) for w in 120 set(self.words_re.findall(document.content)) 121 if len(w) < self.max_word_length]) 122 128 words = self.framework.reduce(document.content) 123 129 doc_set = set([document.uri]) 124 130 … … 160 166 def search(self, query): 161 167 # FIXME currently simply finding the intersection of all documents (AND) 162 words = [self.framework.stemmer(w.lower()) for w in 163 query.as_string(and_=' ', or_=' ', not_=' ').split()] 164 168 query.reduce(self.framework.reduce) 169 words = query.as_string(and_=' ', or_=' ', not_=' ').split() 165 170 uris = None 166 171 for word in words: … … 173 178 174 179 175 indexer_factory = PluginFactory(BuiltinIndexer , max_word_length=int)180 
indexer_factory = PluginFactory(BuiltinIndexer) 176 181 177 182 pyndexter/trunk/pyndexter/indexers/xapian.py
r378 r379 21 21 22 22 class XapianIndexer(Indexer): 23 def __init__(self, framework, path , words=r'\w+', max_word_length=240):23 def __init__(self, framework, path): 24 24 Indexer.__init__(self, framework) 25 self.words = re.compile(words) 26 self.max_word_length = max_word_length25 26 framework.reduce.split = True 27 27 28 28 path = path.encode('utf-8') … … 42 42 doc = xapian.Document() 43 43 44 # Xapian doesn't support UTF-8 yet. Coming soon.44 # FIXME Xapian doesn't support UTF-8 yet. "Coming soon." 45 45 content = document.content.encode('utf-8') 46 46 uri = document.uri.encode('utf-8') … … 50 50 doc.add_term('Q' + uri) 51 51 52 words = [self.framework.stemmer(w.lower()) 53 for w in set(self.words.findall(content))] 54 for word in self.words.finditer(content): 55 term = self.framework.stemmer(word.group().lower()) 56 if len(term) > self.max_word_length: 57 continue 58 doc.add_posting(term, word.start()) 52 words = self.framework.reduce(content) 53 for word in words: 54 doc.add_posting(word, 0) 59 55 60 56 self.db.replace_document('Q' + uri, doc) … … 79 75 class StemmerWrapper(xapian.Stem): 80 76 def stem_word(self, word): 81 return framework. stemmer(word)77 return framework.reduce.stemmer(word) 82 78 83 79 query_parser = xapian.QueryParser() 84 query_parser.set_stemmer(StemmerWrapper('english'))85 80 query = query_parser.parse_query(query.as_string().encode('utf-8').lower()) 86 81 enquire = xapian.Enquire(self.db) pyndexter/trunk/pyndexter/__init__.py
r378 r379 41 41 from StringIO import StringIO 42 42 from urlparse import urlsplit, urlunsplit 43 from pyndexter.util import set 43 from pyndexter.util import set, URI 44 44 45 45 … … 58 58 READONLY READWRITE 59 59 60 Query Framework Document Source Indexer Result StateStore Hit PluginFactory 60 Query Framework Document Source Indexer Result StateStore Hit PluginFactory URI 61 61 """.split() 62 62 … … 177 177 `difference()` assume that `_state` will contain a dictionary of 178 178 uri:modification-time mappings. 179 180 All URI's passed to and from Source objects must be `URI` objects. 179 181 180 182 (All attributes, including document contents and URI's must be in unicode) … … 446 448 return _convert(self) 447 449 450 def reduce(self, reduce): 451 """Pass each TERM node through `Reducer`.""" 452 def _reduce(node): 453 if not node: 454 return 455 if node.type == node.TERM: 456 node.value = reduce(node.value) 457 _reduce(node.left) 458 _reduce(node.right) 459 _reduce(self) 460 448 461 # Internal methods 449 462 def _tokenise(self, phrase): … … 469 482 470 483 484 class Reducer(object): 485 """Compact all words in a block of text.""" 486 487 def __init__(self, words_re=re.compile(r'\w+'), stemmer=lambda w: w, 488 min_word_length=3, max_word_length=64, unique=False, 489 split=False, lower=True): 490 """`words_re` is a regular expression object or string. 491 492 `stemmer` is a callable that stems a single word. 493 494 If `unique` is true, return a string of **unordered** words with 495 duplicates removed. 496 497 If `split` is true, return words in a collection rather than joining 498 them into a single string. 
499 500 If `lower` is true, lowercase text.""" 501 502 if isinstance(words_re, basestring): 503 words_re = re.compile(words_re, re.UNICODE) 504 self.words_re = words_re 505 self.stemmer = stemmer 506 self.min_word_length = min_word_length 507 self.max_word_length = max_word_length 508 self.unique = unique 509 self.split = split 510 self.lower = lower 511 512 def __call__(self, text, unique=None, split=None): 513 if unique is None: 514 unique = self.unique 515 516 if unique: 517 out = set() 518 def append(word): 519 out.add(word) 520 else: 521 out = [] 522 def append(word): 523 out.append(word) 524 525 min = self.min_word_length 526 max = self.max_word_length 527 stemmer = self.stemmer 528 529 if self.lower: 530 text = text.lower() 531 532 words = self.words_re.findall(text) 533 if unique: 534 words = set(words) 535 536 for word in words: 537 if min > len(word) > max: 538 continue 539 append(stemmer(word)) 540 541 if split is None: 542 split = self.split 543 if split: 544 return out 545 return u' '.join(out) 546 547 471 548 class StateStore(object): 472 549 """A class providing file-like objects for storage and retrieval of … … 541 618 return None 542 619 620 543 621 class PluginFactory(object): 544 622 """Factory for translating URL-style query parameters into a standard 545 moduleconstructor call.623 plugin constructor call. 546 624 547 625 >>> class C: … … 645 723 646 724 If the `Indexer` is not capable of storing state and automatic updates are 647 desired, a `StateStore` object should be passed to the `Framework`. 648 649 `indexer` is a URI used to construct an indexer, or an `Indexer` object. 650 651 `stemmer` is a callable that stems individual words. 
Indexers can 652 optionally use this, though some may have their own stemming mechanisms, 653 typically passed as a URI parameter.""" 654 655 def __init__(self, indexer, sources=[], mode=READWRITE, state_store=None, 725 desired, a `StateStore` object should be passed to the `Framework`.""" 726 727 def __init__(self, indexer, mode=READWRITE, state_store=None, reduce=None, 656 728 stemmer=None): 729 """`indexer` is a URI used to construct an indexer, or an `Indexer` 730 object. 731 732 `reduce` is a `Reducer` object.If `reduce` is not specified, a default 733 `Reduce` object will be instantiated using `stemmer` (URI or callable) 734 as defaults. '''NOTE:''' Use of the reducer is optional - some 735 indexers may implement stemming and reduction internally.""" 657 736 self.mode = mode 737 738 if reduce is None: 739 if stemmer is None: 740 stemmer = lambda word: word 741 elif isinstance(stemmer, basestring): 742 Stemmer = self._load_plugin('stemmer', stemmer) 743 stemmer = Stemmer(uri=stemmer) 744 self.reduce = Reducer(stemmer=stemmer) 745 else: 746 self.reduce = reduce 658 747 659 748 if isinstance(indexer, basestring): … … 663 752 self.indexer = indexer 664 753 665 if stemmer is None:666 self.stemmer = lambda word: word667 elif isinstance(stemmer, basestring):668 self.stemmer = self._load_plugin('stemmer', stemmer)669 self.stemmer = self.stemmer(uri=stemmer)670 else:671 self.stemmer = stemmer672 673 754 if state_store is None: 674 755 self.state_store = self.indexer.state_store() … … 676 757 self.state_store = state_store 677 758 678 sources = [self._load_plugin('source', source)(framework=self, uri=source)679 for source in sources]680 681 759 from pyndexter.sources.metasource import MetaSource 682 760 self.source = MetaSource(self) 683 for source in sources:684 self.add_source(source)685 761 686 762 def add_source(self, source): … … 847 923 self._document = document 848 924 self.attributes = attributes 925 if isinstance(uri, basestring): 926 from pyndexter.util 
import URI 927 uri = URI(uri) 849 928 self.attributes['uri'] = uri 850 929 pyndexter/trunk/pyndexter/sources/file.py
r376 r379 23 23 from stat import * 24 24 from urlparse import urlsplit, urlunsplit 25 from urllib import quote, unquote 25 26 from pyndexter import * 26 27 … … 57 58 def matches(self, uri): 58 59 scheme, netloc, path, query, fragment = urlsplit(uri, 'file') 59 path = os.path.normpath( path)60 path = os.path.normpath(unquote(path)) 60 61 return scheme == 'file' and \ 61 62 path.startswith(self.path) and \ … … 86 87 87 88 def _file2uri(self, file): 88 return urlunsplit(('file', '', file, '', ''))89 return urlunsplit(('file', '', quote(file), '', '')) 89 90 90 91 def _uri2file(self, uri): … … 93 94 raise InvalidURI("URI scheme in '%s' not supported by FileSource" 94 95 % scheme) 95 path = os.path.normpath( path)96 path = os.path.normpath(unquote(path)) 96 97 if not path.startswith(self.path): 97 98 raise InvalidURI("Requested URI '%s' is not from this FileSource" pyndexter/trunk/pyndexter/util.py
r378 r379 8 8 9 9 import re 10 import posixpath 11 from StringIO import StringIO 12 from urllib import quote, unquote 10 13 try: 11 14 set = set 15 frozenset = frozenset 12 16 except: 13 17 from sets import Set as set 14 18 from sets import ImmutableSet as frozenset 15 19 20 21 __all__ = """ 22 set frozenset 23 quote unquote 24 URI 25 """.split() 16 26 17 27 class URI(object): … … 28 38 PS. `urlparse` is not useful. """ 29 39 30 _pattern = re.compile(r'(? P<scheme>[^:]+)://(?:(?P<username>[^:@]*)(?::(?P<password>[^@]*))?@)?(?P<host>[^?/#]*)(?P<path>/[^#?]*)?(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?')40 _pattern = re.compile(r'(?:(?P<scheme>[^:]+)://)?(?:(?P<username>[^:@]*)(?::(?P<password>[^@]*))?@)?(?P<host>[^?/#:]*)(?::(P<port>[\d+]+))?(?P<path>/[^#?]*)?(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?') 31 41 32 __slots__ = ('scheme', 'username', 'password', 'host', 'p ath', 'query',33 ' fragment')42 __slots__ = ('scheme', 'username', 'password', 'host', 'port', '_path', 43 'query', 'fragment') 34 44 35 def __init__(self, uri=None): 36 if uri is not None: 45 def __init__(self, uri=None, scheme='', username='', password='', host='', 46 port='', path='', query={}, fragment=''): 47 self._path = '' 48 # Copy attributes of a URI object 49 if isinstance(uri, URI): 50 from copy import copy 51 self.scheme, self.username, self.password, self.host, self.port, \ 52 self.path, self.query, self.fragment = \ 53 uri.scheme, uri.username, uri.password, uri.host, \ 54 uri.port, uri.path, copy(uri.query), uri.fragment 55 elif uri is not None: 56 # Parse URI string 37 57 from cgi import parse_qs 38 58 … … 40 60 if match is None: 41 61 raise ValueError('Invalid URI') 42 groups = match.groups() 43 groups = groups[0:5] + (parse_qs(groups[5] or ''),) + groups[6:] 44 groups = [group or '' for group in groups] 62 groups = [g or '' for g in match.groups()] 63 groups = map(unquote, groups[0:6]) + \ 64 [parse_qs(groups[6] or '')] + \ 65 map(unquote, groups[7:]) 66 self.scheme, 
self.username, self.password, self.host, self.port, \ 67 self.path, self.query, self.fragment = groups 45 68 else: 46 groups = [''] * 7 69 # Explicitly provide URI components 70 self.scheme, self.username, self.password, self.host, self.port, \ 71 self.path, self.query, self.fragment = scheme, username, \ 72 password, host, port, path, query, fragment 47 73 48 if not groups[5]: 49 groups[5] = {} 50 self.scheme, self.username, self.password, self.host, self.path, \ 51 self.query, self.fragment = groups 74 def _set_path(self, path): 75 if path: 76 self._path = '/' + posixpath.normpath(path).lstrip('/') 77 else: 78 self._path = '' 79 80 def _get_path(self): 81 return self._path 82 83 path = property(_get_path, _set_path) 52 84 53 85 def __ne__(self, other): … … 55 87 56 88 def __repr__(self): 57 uri = self.scheme + '://'89 uri = self.scheme and (quote(self.scheme) + '://') or '' 58 90 if self.username or self.password: 59 91 if self.username: 60 uri += self.username92 uri += quote(self.username) 61 93 if self.password: 62 uri += ':' + self.password94 uri += ':' + quote(self.password) 63 95 uri += '@' 64 uri += self.host + self.path 96 uri += quote(self.host) 97 if self.port: 98 uri += ':%s' % port 99 uri += quote(self.path) 65 100 if self.query: 66 uri += '?' + '&'.join(['&'.join(['%s=%s' % (k, v) for v in l])101 uri += '?' 
+ '&'.join(['&'.join(['%s=%s' % (k, quote(v)) for v in l]) 67 102 for k, l in sorted(self.query.items())]) 68 103 if self.fragment: 69 uri += '#' + self.fragment104 uri += '#' + quote(self.fragment) 70 105 return uri 71 72 def reduce_text(text, words_re, stemmer=lambda w: w, min_word_length=3,73 max_word_length=64, unique=False):74 """Compact all words in a block of text.75 76 `words_re` is a compiled re object, `stemmer` is a callable returning a77 stemmed word.78 79 If `unique` is true, return a string of **unordered** words with duplicates80 removed."""81 from StringIO import StringIO82 if unique:83 out = set()84 def append(word):85 out.add(word)86 else:87 out = []88 def append(word):89 out.append(word)90 for word in words_re.findall(text):91 # Cull short and long words92 if min_word_length > len(word) > max_word_length:93 continue94 append(stemmer(word))95 return u' '.join(out)96 pyndexter/trunk/.todo
r378 r379 114 114 Add utility function for converting attribute dictionary keys to plain strings (common pattern). 115 115 </note> 116 <note priority="medium" time="1170829158"> 117 Normalise URI usage everywhere. 118 </note> 119 <note priority="veryhigh" time="1170915596"> 120 Fix port parsing in util.URI. 121 </note> 116 122 </todo>
