Changeset 381
- Timestamp:
- 02/08/07 19:54:27 (2 years ago)
- Files:
-
- pyndexter/trunk/pyndexter/indexers/hype.py (modified) (4 diffs)
- pyndexter/trunk/pyndexter/indexers/hyperestraier.py (modified) (5 diffs)
- pyndexter/trunk/pyndexter/indexers/lucene.py (modified) (1 diff)
- pyndexter/trunk/pyndexter/indexers/pyndex.py (modified) (1 diff)
- pyndexter/trunk/pyndexter/indexers/xapian.py (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/trunk/pyndexter/indexers/hype.py
r380 r381 43 43 if k != 'uri': 44 44 hdoc['@' + k] = unicode(v) 45 hdoc.add_text(document.content) 45 for line in document.content.splitlines(): 46 hdoc.add_text(line) 46 47 self.db.put_doc(hdoc) 47 48 … … 60 61 self.db.optimize() 61 62 62 def sync(self): 63 def fetch(self, uri): 64 doc = self.db.get_doc_by_uri(uri) 65 if not doc: 66 raise DocumentNotFound(uri) 67 attributes = self._translate_attributes(doc) 68 return Document(content='\n'.join(doc.texts), quality=0.99, **attributes) 69 70 def flush(self): 63 71 self.db.sync() 72 self.db.flush() 64 73 65 74 def close(self): 66 75 self.db = None 76 77 # Internal methods 78 def _translate_attributes(self, hdoc): 79 attributes = {} 80 for k in hdoc.attributes: 81 if k[0] == '@': 82 attributes[k[1:]] = hdoc.get(k) 83 else: 84 attributes[k] = hdoc.get(k) 85 return attributes 67 86 68 87 … … 99 118 # Internal methods 100 119 def _translate(self, doc, score=None): 101 attrs = self. _translate_attributes(doc)120 attrs = self.indexer._translate_attributes(doc) 102 121 if self.enable_scoring: 103 122 if score is None: … … 106 125 return Hit(current=self.indexer.framework.fetch, 107 126 indexed=self.indexer.fetch, **attrs) 108 109 def _translate_attributes(self, hdoc):110 attributes = {}111 for k in hdoc.attributes:112 if k[0] == '@':113 attributes[k[1:]] = hdoc.get(k)114 else:115 attributes[k] = hdoc.get(k)116 return attributes117 pyndexter/trunk/pyndexter/indexers/hyperestraier.py
r380 r381 48 48 hdoc.add_attr(unicode('@' + k).encode('utf-8'), 49 49 unicode(v).encode('utf-8')) 50 hdoc.add_text(document.content.encode('utf-8')) 50 for line in document.content.splitlines(): 51 hdoc.add_text(line.encode('utf-8')) 51 52 self.db.put_doc(hdoc, 1) 52 53 … … 58 59 self.db.out_doc(id, HyperEstraier.Database.ODCLEAN) 59 60 61 def fetch(self, uri): 62 uri = uri.encode('utf-8') 63 id = self.db.uri_to_id(uri) 64 if id == -1: 65 raise DocumentNotFound(uri) 66 doc = self.db.get_doc(id, 0) 67 attributes = self._translate_attributes(doc) 68 return Document(content=u'\n'.join([t.decode('utf-8') 69 for t in doc.texts()]), 70 quality=0.99, 71 **attributes) 72 60 73 def search(self, query): 61 74 phrase = query.as_string(not_='ANDNOT ') … … 65 78 self.db.optimize() 66 79 67 def sync(self):80 def flush(self): 68 81 self.db.sync() 69 82 … … 79 92 search = self.db.search(cond, 0) 80 93 return HyperestraierResult(self, query, search) 94 95 # Internal methods 96 def _translate_attributes(self, hdoc): 97 attributes = {} 98 for k in hdoc.attr_names(): 99 if k[0] == '@': 100 attributes[k[1:]] = hdoc.attr(k).decode('utf-8') 101 else: 102 attributes[k] = hdoc.attr(k).decode('utf-8') 103 return attributes 104 81 105 82 106 … … 100 124 return Hit(current=self.indexer.framework.fetch, 101 125 indexed=self.indexer.fetch, 102 **self._translate_attributes(doc)) 103 104 def _translate_attributes(self, hdoc): 105 attributes = {} 106 for k in hdoc.attr_names(): 107 if k[0] == '@': 108 attributes[k[1:]] = hdoc.attr(k).decode('utf-8') 109 else: 110 attributes[k] = hdoc.attr(k).decode('utf-8') 111 return attributes 112 126 **self.indexer._translate_attributes(doc)) pyndexter/trunk/pyndexter/indexers/lucene.py
r380 r381 58 58 self.writer.optimize() 59 59 60 def sync(self): 60 def flush(self): 61 61 try: 62 62 # XXX Assume this will make it into the Lucene bindings pyndexter/trunk/pyndexter/indexers/pyndex.py
r380 r381 45 45 self.db.optimize() 46 46 47 def sync(self): 47 def flush(self): 48 48 self.db.commit() 49 49 pyndexter/trunk/pyndexter/indexers/xapian.py
r380 r381 61 61 self.db.delete_document('Q' + uri.encode('utf-8')) 62 62 63 def sync(self): 63 # def fetch(self, uri): 64 # terms = self.db.allterms() 65 # terms.skip_to('Q' + uri.encode('utf-8')) 66 # term = terms.next() 67 # print term 68 # doc = self.db.get_document(term[1]) 69 # print 'monkey' in doc.get_data().lower() 70 # return Document(uri=uri, content=doc.get_data().decode('utf-8'), 71 # quality=0.95) 72 73 def __iter__(self): 74 terms = self.db.allterms() 75 terms.skip_to('Q') 76 for term in terms: 77 if term[0][0] != 'Q': 78 return 79 yield term[0][1:].decode('utf-8') 80 81 def flush(self): 64 82 self.db.flush() 65 83 … … 73 91 # Fake stemmer to use the frameworks 74 92 framework = self.framework 75 class StemmerWrapper(xapian.Stem): 76 def stem_word(self, word): 77 return framework.reduce.stemmer(word) 78 93 query.reduce(self.framework.reduce) 79 94 query_parser = xapian.QueryParser() 80 95 xq = query_parser.parse_query(query.as_string().encode('utf-8').lower())
