root/pyndexter/trunk/pyndexter/indexers/_xapian.py

Revision 452, 4.2 kB (checked in by athomas, 1 year ago)

pyndexter: All modules are now prefixed with _ to avoid import collisions. Updated unit tests.

Line 
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright (C) 2006 Alec Thomas <alec@swapoff.org>
4 #
5 # This software is licensed as described in the file COPYING, which
6 # you should have received as part of this distribution.
7 #
8
9 """
10 Xapian
11 ------
12
13 Adapter for `Xapian <http://www.xapian.org>`_, a fast full-text indexing
14 engine.
15
16 Usage
17 ~~~~~
18
19 ::
20
21     xapian://<path>
22
23 Installation
24 ~~~~~~~~~~~~
25
26 Install Xapian for your distribution (typically the package ``xapian-core``).
27
28 If your distribution also includes the SWIG bindings, install those; otherwise build them from source:
29
30 ::
31
32     wget http://www.oligarchy.co.uk/xapian/0.9.9/xapian-bindings-0.9.9.tar.gz
33     tar xfzv xapian-bindings-0.9.9.tar.gz
34     cd xapian-bindings-0.9.9
35     ./configure
36     make
37     make install
38 """
39
40 import os
41 import re
42 from pyndexter import *
43 import xapian
44
45
# Names exported by ``from <this module> import *``.
__all__ = [
    'XapianIndexer',
    'XapianResult',
]
47
48
class XapianIndexer(Indexer):
    """Indexer adapter that stores documents in a Xapian flint database.

    Each document is saved with its UTF-8 encoded content as the Xapian
    document data, and is identified by a unique term consisting of the
    prefix ``Q`` followed by the UTF-8 encoded URI.
    """

    # Prefix of the unique term that carries a document's URI.
    _URI_PREFIX = 'Q'

    def __init__(self, framework, path):
        """Open the database under `path`, creating it in READWRITE mode.

        :param framework: owning framework; its ``mode`` selects read-only
                          or read-write access and its ``reduce`` callable
                          turns text into index terms.
        :param path: directory holding the index; the Xapian database lives
                     in its ``xapian.db`` subdirectory.
        """
        Indexer.__init__(self, framework)

        # presumably makes reduce() return individual words rather than a
        # single string -- confirm against the framework's reducer
        framework.reduce.split = True

        path = path.encode('utf-8')
        self.path = path
        self.xapian_path = os.path.join(path, 'xapian.db')
        self.state_path = os.path.join(path, 'state.db')

        if self.framework.mode == READWRITE:
            if not os.path.exists(self.xapian_path):
                os.makedirs(self.xapian_path)
            self.db = xapian.flint_open(self.xapian_path,
                                        xapian.DB_CREATE_OR_OPEN)
        else:
            self.db = xapian.flint_open(self.xapian_path)

    def _uri_term(self, uri):
        """Return the unique Xapian term that identifies `uri`."""
        return self._URI_PREFIX + unicode(uri).encode('utf-8')

    def index(self, document):
        """Add `document` to the index, replacing any entry with its URI.

        :param document: object providing ``content`` (text) and ``uri``.
        """
        doc = xapian.Document()

        # FIXME Xapian doesn't support UTF-8 yet. "Coming soon."
        content = document.content.encode('utf-8')
        term = self._uri_term(document.uri)

        doc.set_data(content)
        doc.add_term(term)

        # Every word is posted at position 0, i.e. no positional
        # information is recorded.
        for word in self.framework.reduce(content):
            doc.add_posting(word, 0)

        # Keyed on the unique term, so re-indexing a URI overwrites it.
        self.db.replace_document(term, doc)

    # Indexing already replaces an existing entry, so both share one
    # implementation.
    replace = index

    def discard(self, uri):
        """Remove the document identified by `uri` from the index."""
        self.db.delete_document(self._uri_term(uri))

    def fetch(self, uri):
        """Return the indexed copy of the document identified by `uri`.

        :raises DocumentNotFound: if no document carries the URI's term.
        """
        for posting in self.db.postlist(self._uri_term(uri)):
            doc = self.db.get_document(posting[0])
            # TODO fetch attributes
            return Document(uri=uri, content=doc.get_data().decode('utf-8'),
                            quality=0.95)
        raise DocumentNotFound(uri)

    def __iter__(self):
        """Yield the URI (as unicode) of every indexed document."""
        prefix = self._URI_PREFIX
        terms = self.db.allterms()
        terms.skip_to(prefix)
        for term in terms:
            # The loop stops at the first term without the prefix; this
            # relies on allterms() returning terms in sorted order.
            if not term[0].startswith(prefix):
                return
            yield term[0][len(prefix):].decode('utf-8')

    def flush(self):
        """Flush pending changes by calling ``flush()`` on the database."""
        self.db.flush()

    def close(self):
        """Flush the database and drop the reference to it."""
        self.flush()
        # No explicit close call is made on the database object; the
        # reference is simply released.
        self.db = None

    def search(self, query):
        """Run `query` against the index and return a `XapianResult`.

        :param query: query object providing ``reduce()`` and
                      ``as_string()``.
        """
        # The framework's reducer stands in for a Xapian stemmer.
        query.reduce(self.framework.reduce)
        query_parser = xapian.QueryParser()
        # NOTE(review): lower() is applied to the already-encoded byte
        # string, so presumably only ASCII letters are lowercased --
        # confirm this matches how terms are reduced at index time.
        query_text = query.as_string().encode('utf-8').lower()
        enquire = xapian.Enquire(self.db)
        enquire.set_query(query_parser.parse_query(query_text))
        return XapianResult(self, query, enquire)
126
# Module-level factory wrapping XapianIndexer.
# NOTE(review): presumably declares a ``max_word_length`` option converted
# with ``int`` -- confirm against PluginFactory.
indexer_factory = PluginFactory(
    XapianIndexer,
    max_word_length=int,
)
128
129
class XapianResult(Result):
    """Search result set backed by a ``xapian.Enquire`` object.

    ``self.context`` is used as the Enquire object and ``self.indexer`` as
    the owning indexer; both are presumably stored by ``Result.__init__``
    from the arguments given in ``XapianIndexer.search`` -- confirm.
    """

    # Maximum number of hits produced by plain iteration.
    _ITER_LIMIT = 20

    def __iter__(self):
        """Yield a `Hit` for each of the first ``_ITER_LIMIT`` matches."""
        for hit in self.context.get_mset(0, self._ITER_LIMIT):
            yield self._translate(hit)

    def __getitem__(self, index):
        """Return the `Hit` at position `index`.

        :raises IndexError: if there is no match at that position, as the
                            sequence protocol expects.
        """
        for hit in self.context.get_mset(index, 1):
            return self._translate(hit)
        raise IndexError(index)

    def __getslice__(self, i, j):
        """Yield `Hit` objects for matches ``i`` up to (not including) ``j``.

        This is a generator, so slicing returns an iterator, not a list.
        """
        for hit in self.context.get_mset(i, j - i):
            yield self._translate(hit)

    def __len__(self):
        # NOTE(review): relies on the context object supporting len() --
        # confirm that the Enquire object does.
        return len(self.context)

    # Internal methods
    def _translate(self, hit):
        """Convert a raw Xapian match entry into a `Hit`.

        The document URI is recovered from the first term at or after
        ``Q`` in the document's term list, with its first character
        stripped.
        """
        doc = hit[xapian.MSET_DOCUMENT]
        terms = doc.termlist()
        terms.skip_to('Q')
        uri = terms.next()[0][1:]
        assert uri, 'uniQue term (URI) not found in document term list'
        return Hit(URI(uri),
                   current=self.indexer.framework.fetch,
                   indexed=self.indexer.fetch,
                   did=hit[xapian.MSET_DID],
                   score=float(hit[xapian.MSET_PERCENT]) / 100.0)
Note: See TracBrowser for help on using the browser.