Changeset 383
- Timestamp:
- 02/09/07 14:07:35 (2 years ago)
- Files:
-
- pyndexter/trunk/pyndexter/indexers/builtin.py (modified) (11 diffs)
- pyndexter/trunk/pyndexter/indexers/xapian.py (modified) (2 diffs)
- pyndexter/trunk/pyndexter/__init__.py (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pyndexter/trunk/pyndexter/indexers/builtin.py
r380 r383 17 17 import anydbm 18 18 import cPickle as pickle 19 import md5 19 20 from UserDict import DictMixin 21 from StringIO import StringIO 22 from gzip import GzipFile 20 23 from pyndexter import * 21 24 from pyndexter.util import set … … 27 30 28 31 def update(self, key, values): 29 key = key.encode('utf-8')32 key = pickle.dumps(key, 2) 30 33 try: 31 34 v = pickle.loads(self.db[key]) … … 36 39 37 40 def remove(self, key, values=None): 38 key = key.encode('utf-8')41 key = pickle.dumps(key, 2) 39 42 if values is None: 40 43 try: … … 51 54 52 55 def replace(self, key, values): 53 key = key.encode('utf-8')56 key = pickle.dumps(key, 2) 54 57 self.db[key] = pickle.dumps(values, 2) 55 58 56 59 def get(self, key): 57 key = key.encode('utf-8')60 key = pickle.dumps(key, 2) 58 61 try: 59 62 return pickle.loads(self.db[key]) … … 62 65 63 66 def keys(self): 64 return self.db.keys() 67 for key in self.db.keys(): 68 yield pickle.loads(key) 65 69 66 70 … … 86 90 """Constructor URI is: 87 91 88 builtin://<path>/?words=<regex>&dbm=<dbm> 89 90 eg. 91 92 builtin:///tmp/builtin.idx?dbm=gdbm 92 builtin://<path>/?dbm=<dbm>&cache=<bool>&compact=<bool> 93 93 94 94 Supported dbm's are `anydbm`, `dbhash`, `gdbm` and `dbm` (Python 2.5). 95 95 `anydbm` is the default. 96 96 97 If `cache` is specified, the full original text of each document will 98 be stored in the index (large). 99 100 If `compact` is True, each word in the index is given a numeric ID. 101 Currently this has a fairly large performance impact, but it does 102 reduce the size of the index considerably. 103 104 eg. 105 106 builtin:///tmp/builtin.idx?dbm=gdbm 107 97 108 """ 98 def __init__(self, framework, path, dbm='anydbm'): 109 def __init__(self, framework, path, dbm='anydbm', cache=False, 110 compact=False): 99 111 Indexer.__init__(self, framework) 100 112 101 113 self.path = path 114 self.compact = compact 115 self.cache = cache 102 116 self.state_path = os.path.join(path, 'store.db') 103 117 self.db_path = os.path.join(path, 'builtin.db') 104 118 119 # We want the minimum set of words 105 120 framework.reduce.split = True 106 121 framework.reduce.unique = True … … 115 130 mode = 'r' 116 131 117 # word:set(uri) 118 self.words = KeyedSet(dbm.open(os.path.join(self.db_path, 'words'), mode)) 119 # uri:set(word) 120 self.uris = KeyedSet(dbm.open(os.path.join(self.db_path, 'uris'), mode)) 121 # attribute:dict(attributes) 122 self.attributes = PickleDict(dbm.open(os.path.join(self.db_path, 123 'attributes'), mode)) 132 def dbopen(name): 133 return dbm.open(os.path.join(self.db_path, name), mode) 134 135 # wordid:set(uriid) 136 self.words = KeyedSet(dbopen('words')) 137 # uriid:set(wordid) 138 self.uris = KeyedSet(dbopen('uris')) 139 # uri:dict(attributes) 140 self.attributes = PickleDict(dbopen('attributes')) 141 142 if cache: 143 self.cachedb = PickleDict(dbopen('cache')) 144 145 if compact: 146 # id:word mapping 147 self.idword = PickleDict(dbopen('idword')) 148 # word:id mapping 149 self.wordid = PickleDict(dbopen('wordid')) 150 # key:value config 151 self.config = PickleDict(dbopen('config')) 152 153 self.config.setdefault('wordid', 0) 154 else: 155 self._words = self._wids = lambda w: set(w) 156 self._word = self._wid = lambda w: w 124 157 125 158 def index(self, document): 126 self.attributes[document.uri] = document.attributes 127 128 words = self.framework.reduce(document.content) 129 doc_set = set([document.uri]) 159 160 uri = self._wid(document.uri) 161 words = self._wids(self.framework.reduce(document.content)) 162 doc_set = set([uri]) 163 164 if self.cache: 165 if self.compact: 166 # Do a low-compression gzip 167 buffer = StringIO() 168 gz = GzipFile(fileobj=buffer, compresslevel=1, mode='wb') 169 gz.write(document.content.encode('utf-8')) 170 gz.close() 171 self.cachedb[uri] = buffer.getvalue() 172 else: 173 self.cachedb[uri] = document.content 174 175 self.attributes[uri] = document.attributes 130 176 131 177 old_words = self.words.get(document.uri) … … 141 187 self.words.update(word, doc_set) 142 188 143 self.uris.replace( document.uri, words)189 self.uris.replace(uri, words) 144 190 145 191 def discard(self, uri): … … 157 203 158 204 def __iter__(self): 159 return iter(self.uris.keys()) 205 for uri in self.uris.keys(): 206 yield self._word(uri) 160 207 161 208 def fetch(self, uri): 162 attributes = self.attributes.get(uri, {}) 209 uriid = self._wid(uri) 210 attributes = self.attributes.get(uriid, {}) 163 211 attributes = dict([(k.encode('utf-8'), v) 164 212 for k, v in attributes.iteritems()]) 165 213 attributes['uri'] = uri 166 attributes['quality'] = 0.1 167 return Document(content=' '.join(self.uris.get(uri)), **attributes) 214 if self.cache: 215 content = self.cachedb[uriid] 216 if self.compact: 217 gz = GzipFile(fileobj=StringIO(content), mode='rb') 218 content = gz.read().decode('utf-8') 219 quality = 0.99 220 else: 221 content = ' '.join(self._words(self.uris.get(uriid))) 222 quality = 0.1 223 return Document(content=content, quality=quality, **attributes) 168 224 169 225 def close(self): … … 171 227 self.uris = None 172 228 self.attributes = None 229 self.wordid = None 230 self.idword = None 231 self.config = None 173 232 174 233 def search(self, query): 175 # FIXME currently simply finding the intersection of all documents (AND) 234 # FIXME currently simply finding the intersection of all documents 235 # (AND) 176 236 query.reduce(self.framework.reduce) 177 words = query.as_string(and_=' ', or_=' ', not_=' ').split() 237 # FIXME Words without a WID can be automatically excluded from the 238 # search 239 words = self._wids(query.as_string(and_=' ', or_=' ', not_=' ').split()) 178 240 uris = None 179 241 for word in words: … … 183 245 uris.intersection_update(self.words.get(word)) 184 246 185 return BuiltinResult(self, query, list(uris)) 186 187 188 indexer_factory = PluginFactory(BuiltinIndexer) 247 return BuiltinResult(self, query, list(self._words(uris))) 248 249 # Internal methods 250 def _wids(self, words): 251 """Convert a collection of words to a set of wids.""" 252 out = set() 253 for word in words: 254 out.add(self._wid(word)) 255 return out 256 257 def _words(self, wids): 258 """Convert a collection of wids to words.""" 259 out = set() 260 for wid in wids: 261 out.add(self.idword[wid]) 262 return out 263 264 def _wid(self, word): 265 """Return, or allocate, a unique word identifier.""" 266 try: 267 return self.wordid[word] 268 except KeyError: 269 id = self.config['wordid'] 270 self.config['wordid'] = id + 1 271 self.wordid[word] = id 272 self.idword[id] = word 273 return id 274 275 def _word(self, wid): 276 return self.idword[wid] 277 278 279 indexer_factory = PluginFactory(BuiltinIndexer, cache=bool, compact=bool) 189 280 190 281 pyndexter/trunk/pyndexter/indexers/xapian.py
r381 r383 61 61 self.db.delete_document('Q' + uri.encode('utf-8')) 62 62 63 # def fetch(self, uri): 64 # terms = self.db.allterms() 65 # terms.skip_to('Q' + uri.encode('utf-8')) 66 # term = terms.next() 67 # print term 68 # doc = self.db.get_document(term[1]) 69 # print 'monkey' in doc.get_data().lower() 70 # return Document(uri=uri, content=doc.get_data().decode('utf-8'), 71 # quality=0.95) 63 def fetch(self, uri): 64 term = 'Q' + uri.encode('utf-8') 65 for docid in self.db.postlist(term): 66 doc = self.db.get_document(docid[0]) 67 # TODO fetch attributes 68 return Document(uri=uri, content=doc.get_data().decode('utf-8'), 69 quality=0.95) 72 70 73 71 def __iter__(self): … … 83 81 84 82 def close(self): 85 self. sync()83 self.flush() 86 84 #self.db.close() 87 85 self.db = None pyndexter/trunk/pyndexter/__init__.py
r382 r383 664 664 """ 665 665 666 BOOL_TRUE = ('1', 'true', 'yes', 'on', 'aye') 667 666 668 class List(object): 667 669 """Translate a parameter that is a list of elements of `type`, … … 731 733 type = self.arg_types.get(k, lambda v: v) 732 734 # If it's a list, and not marked as such, convert it to a scalar 733 if isinstance(v, (tuple, list)) and not isinstance(type, self.List): 735 if isinstance(v, (tuple, list)) and not \ 736 isinstance(type, self.List): 734 737 if len(v) != 1: 735 738 raise ValueError('argument "%s" should be a scalar' % k) 736 739 v = v[0] 737 try: 738 args[k] = type(v) 739 except ValueError, e: 740 raise ValueError('could not coerce argument "%s" with ' 741 'value "%s" to type "%s": %s' 742 % (k, v, type, e)) 740 if type is bool: 741 # Special-case bool 742 if str(v) in self.BOOL_TRUE: 743 args[k] = True 744 else: 745 try: 746 args[k] = bool(float(v)) 747 except ValueError: 748 args[k] = False 749 else: 750 try: 751 args[k] = type(v) 752 except ValueError, e: 753 raise ValueError('could not coerce argument "%s" with ' 754 'value "%s" to type "%s": %s' 755 % (k, v, type, e)) 743 756 744 757 return self.plugin(**args)
