Changeset 380
- Timestamp: 02/08/07 03:49:14 (2 years ago)
- Files:
- pyndexter/trunk/pyndexter/indexers/builtin.py (modified) (3 diffs)
- pyndexter/trunk/pyndexter/indexers/hype.py (modified) (3 diffs)
- pyndexter/trunk/pyndexter/indexers/hyperestraier.py (modified) (3 diffs)
- pyndexter/trunk/pyndexter/indexers/lucene.py (modified) (3 diffs)
- pyndexter/trunk/pyndexter/indexers/lupy.py (modified) (2 diffs)
- pyndexter/trunk/pyndexter/indexers/pyndex.py (modified) (2 diffs)
- pyndexter/trunk/pyndexter/indexers/swishe.py (modified) (2 diffs)
- pyndexter/trunk/pyndexter/indexers/xapian.py (modified) (2 diffs)
- pyndexter/trunk/pyndexter/__init__.py (modified) (8 diffs)
- pyndexter/trunk/.todo (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/trunk/pyndexter/indexers/builtin.py
r379 r380 159 159 return iter(self.uris.keys()) 160 160 161 def fetch(self, uri): 162 attributes = self.attributes.get(uri, {}) 163 attributes = dict([(k.encode('utf-8'), v) 164 for k, v in attributes.iteritems()]) 165 attributes['uri'] = uri 166 attributes['quality'] = 0.1 167 return Document(content=' '.join(self.uris.get(uri)), **attributes) 168 161 169 def close(self): 162 170 self.words = None … … 175 183 uris.intersection_update(self.words.get(word)) 176 184 177 return BuiltinResult(self, list(uris)) 185 return BuiltinResult(self, query, list(uris)) 178 186 179 187 … … 190 198 191 199 def _translate(self, uri): 192 attributes = self.indexer.attributes.get(uri, {}) 200 indexer = self.indexer 201 framework = indexer.framework 202 attributes = indexer.attributes.get(uri, {}) 193 203 attributes['uri'] = uri 194 204 attributes = dict([(k.encode('utf-8'), v) for k, v in attributes.iteritems()]) 195 return Hit(document=self.indexer.framework.fetch, **attributes) 205 return Hit(current=framework.fetch, indexed=indexer.fetch, **attributes) pyndexter/trunk/pyndexter/indexers/hype.py
r377 r380 53 53 54 54 def search(self, query): 55 query = query.as_string(not_='ANDNOT ').decode('utf-8') 56 print query 57 search = self.db.search(query) 58 # if order is not None: 59 # search = search.order(order) 60 return HypeResult(self, search, self.enable_scoring) 55 qs = query.as_string(not_='ANDNOT ').decode('utf-8') 56 search = self.db.search(qs) 57 return HypeResult(self, query, search, self.enable_scoring) 61 58 62 59 def optimise(self): … … 75 72 76 73 class HypeResult(Result): 77 def __init__(self, indexer, context, enable_scoring=True): 74 def __init__(self, indexer, query, context, enable_scoring=True): 78 75 self.enable_scoring = enable_scoring 79 76 if enable_scoring: 80 77 context = context.scores().option(hype.ESTCONDSCFB) 81 Result.__init__(self, indexer, context) 78 Result.__init__(self, indexer, query, context) 82 79 83 80 def __iter__(self): … … 107 104 score = self.context.get_score(index) 108 105 attrs['score'] = score 109 return Hit(document=self.indexer.framework.fetch, **attrs) 106 return Hit(current=self.indexer.framework.fetch, 107 indexed=self.indexer.fetch, **attrs) 110 108 111 109 def _translate_attributes(self, hdoc): pyndexter/trunk/pyndexter/indexers/hyperestraier.py
r377 r380 60 60 def search(self, query): 61 61 phrase = query.as_string(not_='ANDNOT ') 62 return self.hype_search(phrase, simple=False) 62 return self.hype_search(phrase, query, simple=False) 63 63 64 64 def optimise(self): … … 73 73 74 74 # Hyperestraier-specific methods 75 def hype_search(self, phrase, simple=True, order=None): 75 def hype_search(self, phrase, query, simple=True, order=None): 76 76 """ Full Hyperestraier search phrase. """ 77 77 cond = HyperEstraier.Condition() 78 78 cond.set_phrase(phrase.encode('utf-8')) 79 79 search = self.db.search(cond, 0) 80 # if order is not None: 81 # search = search.order(order) 82 return HyperestraierResult(self, search) 80 return HyperestraierResult(self, query, search) 83 81 84 82 … … 100 98 def _translate(self, id): 101 99 doc = self.indexer.db.get_doc(id, 0) 102 return Hit(document=self.indexer.framework.fetch, 100 return Hit(current=self.indexer.framework.fetch, 101 indexed=self.indexer.fetch, 103 102 **self._translate_attributes(doc)) 104 103 pyndexter/trunk/pyndexter/indexers/lucene.py
r377 r380 43 43 44 44 def search(self, query): 45 query = query.as_string() 45 lq = query.as_string() 46 46 searcher = PyLucene.IndexSearcher(self.lucene_store) 47 query = PyLucene.QueryParser('content', self.analyzer).parse(query) 47 lq = PyLucene.QueryParser('content', self.analyzer).parse(lq) 48 48 #sort_field = PyLucene.SortField('RELEVANCE', False) 49 49 #sort = PyLucene.Sort(sort_field) … … 52 52 #sort = PyLucene.Sort.INDEXORDER 53 53 #search = searcher.search(query, sort) 54 search = searcher.search(query) 55 return LuceneResult(self, search) 54 search = searcher.search(lq) 55 return LuceneResult(self, query, search) 56 56 57 57 def optimise(self): … … 85 85 for field in hit.fields(): 86 86 attributes[field.name().encode('utf-8')] = field.stringValue() 87 return Hit(**attributes) 87 return Hit(current=self.indexer.framework.fetch, 88 indexed=self.indexer.fetch, **attributes) pyndexter/trunk/pyndexter/indexers/lupy.py
r377 r380 47 47 searcher = lupy.search.indexsearcher.IndexSearcher(self.db_path) 48 48 hits = searcher.search(lupy_query) 49 return LupyResult(self, hits) 49 return LupyResult(self, query, hits) 50 50 51 51 def optimise(self): … … 87 87 fields = dict([(str(k), doc.get(k)) for k in doc.fieldNames]) 88 88 fields['score'] = self.context.score(index) 89 return Hit(document=self.indexer.framework.fetch, 90 **fields) 89 return Hit(current=self.indexer.framework.fetch, 90 indexed=self.indexer.fetch, **fields) pyndexter/trunk/pyndexter/indexers/pyndex.py
r377 r380 39 39 # FIXME Should probably do a search on each term, and perform set 40 40 # operations. 41 query = query.as_string(and_=' ', or_=' ', not_='') 42 return PyndexResult(self, self.db.find(query)) 41 qs = query.as_string(and_=' ', or_=' ', not_='') 42 return PyndexResult(self, query, self.db.find(qs)) 43 43 44 44 def optimise(self): … … 57 57 def __iter__(self): 58 58 for uri in self.context: 59 yield Document(uri=uri.doc.docname, score=uri.score, 60 content=self.indexer.framework.fetch) 59 yield Hit(uri=uri.doc.docname, score=uri.score, 60 current=self.indexer.framework.fetch, 61 indexed=self.indexer.fetch) pyndexter/trunk/pyndexter/indexers/swishe.py
r374 r380 23 23 def search(self, query): 24 24 results = self.db.query(query.phrase) 25 return SwishEResult(self, results) 25 return SwishEResult(self, query, results) 26 26 27 27 … … 30 30 for row in self.context: 31 31 uri = row.getproperty('swishdocpath') 32 yield Hit(document=self.indexer.framework.fetch, 33 uri=uri) 32 yield Hit(current=self.indexer.framework.fetch, 33 indexed=self.indexer.fetch, uri=uri) 34 34 35 35 def len(self): pyndexter/trunk/pyndexter/indexers/xapian.py
r379 r380 78 78 79 79 query_parser = xapian.QueryParser() 80 query = query_parser.parse_query(query.as_string().encode('utf-8').lower()) 80 xq = query_parser.parse_query(query.as_string().encode('utf-8').lower()) 81 81 enquire = xapian.Enquire(self.db) 82 enquire.set_query(query) 83 return XapianResult(self, enquire) 82 enquire.set_query(xq) 83 return XapianResult(self, query, enquire) 84 84 85 85 … … 113 113 uri = terms.next()[0][1:] 114 114 assert uri, 'uniQue term (URI) not found in document term list' 115 return Hit(uri, document=self.indexer.framework.fetch, 115 return Hit(uri, 116 current=self.indexer.framework.fetch, 117 indexed=self.indexer.fetch, 116 118 did=hit[xapian.MSET_DID], 117 119 score=float(hit[xapian.MSET_PERCENT]) / 100.0) pyndexter/trunk/pyndexter/__init__.py
r379 r380 125 125 fetched from. """ 126 126 127 __slots__ = ('attributes', '_content', 'source' )127 __slots__ = ('attributes', '_content', 'source', 'quality') 128 128 129 129 def __init__(self, uri, content=None, source=None, changed=None, 130 **attributes):130 quality=1.0, **attributes): 131 131 self._content = content 132 132 self.source = source 133 self.quality = quality 133 134 self.attributes = attributes 134 135 self.attributes.update({'uri': uri, 'changed': changed}) … … 415 416 return QueryNode(QueryNode.TERM, value=token[1]) 416 417 raise InvalidQuery('Expected terminal, got "%s"' % tokens[0][1]) 418 419 def terms(self, exclude_not=True): 420 """A generator returning the terms contained in the Query.""" 421 def _convert(node): 422 if not node: 423 return 424 if node.type == node.TERM: 425 yield node.value 426 elif node.type == node.NOT and exclude_not: 427 return 428 else: 429 for child in _convert(node.left): 430 yield child 431 for child in _convert(node.right): 432 yield child 433 434 return _convert(self) 417 435 418 436 def as_string(self, and_=' AND ', or_=' OR ', not_='NOT '): … … 454 472 return 455 473 if node.type == node.TERM: 456 node.value = reduce(node.value )474 node.value = reduce(node.value, unique=False, split=False) 457 475 _reduce(node.left) 458 476 _reduce(node.right) … … 598 616 raise NotImplementedError 599 617 618 def fetch(self, uri): 619 """Attempt to fetch indexer representation of the document. 620 621 Must return a `Document` object with a `quality` attribute between 0.0 622 and 1.0, representing the quality of the document in comparison to the 623 original.""" 624 raise DocumentNotFound(uri) 625 600 626 def replace(self, document): 601 627 """Replace a document in the index. Default is to `discard()` and … … 841 867 `query` is a pyndexter compatible search string. 842 868 843 Returns a `Search` object. """ 844 query = Query(query) 869 Returns a `Result` object. 
""" 870 if isinstance(query, basestring): 871 query = Query(query) 845 872 return self.indexer.search(query) 846 873 … … 886 913 object.""" 887 914 888 def __init__(self, indexer, context):915 def __init__(self, indexer, query, context): 889 916 self.indexer = indexer 917 self.query = query 890 918 self.context = context 891 919 … … 914 942 915 943 class Hit(object): 916 """ Wrapper around a search hit. If ` document` is a callable, it should944 """ Wrapper around a search hit. If `current` is a callable, it should 917 945 be a function that fetches the Document associated with `uri`, which is 918 946 passed as the only argument. """ 919 947 920 __slots__ = ('attributes', '_document') 921 922 def __init__(self, uri, document=None, **attributes): 923 self._document = document 948 __slots__ = ('attributes', '_current', '_indexed') 949 950 def __init__(self, uri, current=None, indexed=None, **attributes): 951 self._current = current 952 self._indexed = indexed 924 953 self.attributes = attributes 925 if isinstance(uri, basestring):926 from pyndexter.util import URI927 uri = URI(uri)954 # if isinstance(uri, basestring): 955 # from pyndexter.util import URI 956 # uri = URI(uri) 928 957 self.attributes['uri'] = uri 929 958 930 959 def get(self, key, default=None): 960 """Get an attribute, but if it doesn't exist return a default value.""" 931 961 return self.attributes.get(key, default) 962 963 def excerpt(self, terms, max_len=240, fuzz=60): 964 """Generate an Excerpt from this Hit.""" 965 try: 966 current = True 967 doc = self.current 968 except: 969 current = False 970 doc = self.indexed 971 return Excerpt(doc, terms, max_len, fuzz, current) 932 972 933 973 def __getattr__(self, key): … … 946 986 self.attributes.iteritems()]) 947 987 948 def _get_document(self): 949 """Fetch Document object using callback.""" 950 if callable(self._document): 951 self._document = self._document(self.uri) 952 return self._document 953 document = property(_get_document) 988 def 
_get_current(self): 989 """Fetch current Document (if possible).""" 990 if callable(self._current): 991 self._current = self._current(self.uri) 992 return self._current 993 current = property(_get_current) 994 995 def _get_indexed(self): 996 """Fetch Indexer representation of Document (if possible).""" 997 if callable(self._indexed): 998 self._indexed = self._indexed(self.uri) 999 return self._indexed 1000 indexed = property(_get_indexed) 1001 1002 1003 class Excerpt(object): 1004 """Generate an excerpt of a Document.""" 1005 def __init__(self, doc, terms, max_len=240, fuzz=60, current=True): 1006 self.text = self._shorten(doc.content, terms, max_len, fuzz) 1007 self.quality = doc.quality 1008 self.current = current 1009 1010 def _shorten(self, text, terms, max_len=240, fuzz=60): 1011 # FIXME Take into account stemming 1012 # FIXME Take into account whole-word only search, or 1013 # wild-card...etc.??? Tricky. 1014 text_low = text.lower() 1015 beg = -1 1016 for k in terms: 1017 i = text_low.find(k.lower()) 1018 if (i > -1 and i < beg) or beg == -1: 1019 beg = i 1020 excerpt_beg = 0 1021 if beg > fuzz: 1022 for sep in ('.', ':', ';', '='): 1023 eb = text.find(sep, beg - fuzz, beg - 1) 1024 if eb > -1: 1025 eb += 1 1026 break 1027 else: 1028 eb = beg - fuzz 1029 excerpt_beg = eb 1030 if excerpt_beg < 0: 1031 excerpt_beg = 0 1032 msg = text[excerpt_beg:beg+max_len] 1033 if beg > fuzz: 1034 msg = '... ' + msg 1035 if beg < len(text)-max_len: 1036 msg = msg + ' ...' 1037 return msg 1038 1039 def __repr__(self): 1040 return self.text 954 1041 955 1042 pyndexter/trunk/.todo
r379 r380 115 115 </note> 116 116 <note priority="medium" time="1170829158"> 117 Normalise URI usage every thing.117 Normalise URI usage everywhere. 118 118 </note> 119 119 <note priority="veryhigh" time="1170915596">
