| 1 |
# -*- coding: utf-8 -*- |
|---|
| 2 |
# |
|---|
| 3 |
# Copyright (C) 2006 Alec Thomas <alec@swapoff.org> |
|---|
| 4 |
# |
|---|
| 5 |
# This software is licensed as described in the file COPYING, which |
|---|
| 6 |
# you should have received as part of this distribution. |
|---|
| 7 |
# |
|---|
| 8 |
|
|---|
| 9 |
""" |
|---|
| 10 |
Xapian |
|---|
| 11 |
------ |
|---|
| 12 |
|
|---|
| 13 |
Adapter for `Xapian <http://www.xapian.org>`_, a fast full-text indexing |
|---|
| 14 |
engine. |
|---|
| 15 |
|
|---|
| 16 |
Usage |
|---|
| 17 |
~~~~~ |
|---|
| 18 |
|
|---|
| 19 |
:: |
|---|
| 20 |
|
|---|
| 21 |
xapian://<path> |
|---|
| 22 |
|
|---|
| 23 |
Installation |
|---|
| 24 |
~~~~~~~~~~~~ |
|---|
| 25 |
|
|---|
| 26 |
Install Xapian for your distribution (typically the package ``xapian-core``). |
|---|
| 27 |
|
|---|
| 28 |
If your distribution also packages the Xapian SWIG (Python) bindings, install those; otherwise build them from source: |
|---|
| 29 |
|
|---|
| 30 |
:: |
|---|
| 31 |
|
|---|
| 32 |
wget http://www.oligarchy.co.uk/xapian/0.9.9/xapian-bindings-0.9.9.tar.gz |
|---|
| 33 |
tar xfzv xapian-bindings-0.9.9.tar.gz |
|---|
| 34 |
cd xapian-bindings-0.9.9 |
|---|
| 35 |
./configure |
|---|
| 36 |
make |
|---|
| 37 |
make install |
|---|
| 38 |
""" |
|---|
| 39 |
|
|---|
| 40 |
import os |
|---|
| 41 |
import re |
|---|
| 42 |
from pyndexter import * |
|---|
| 43 |
import xapian |
|---|
| 44 |
|
|---|
| 45 |
|
|---|
| 46 |
# Public names exported by a star-import of this module.
__all__ = [
    'XapianIndexer',
    'XapianResult',
]
|---|
| 47 |
|
|---|
| 48 |
|
|---|
| 49 |
class XapianIndexer(Indexer):
    """Indexer that stores documents in a Xapian flint database.

    The database lives in ``<path>/xapian.db``.  Each document carries one
    unique term, the prefix ``Q`` followed by its UTF-8 encoded URI, and
    that term is what indexing, discarding and fetching key on.
    """

    def __init__(self, framework, path):
        """Open the database under `path`.

        In READWRITE mode the database directory is created when missing
        and the database is opened (or created) for writing; in any other
        mode an existing database is opened read-only.
        """
        Indexer.__init__(self, framework)

        # NOTE(review): presumably makes the reducer return individual
        # words rather than one string -- index() iterates over its result.
        framework.reduce.split = True

        path = path.encode('utf-8')
        self.path = path
        self.xapian_path = os.path.join(path, 'xapian.db')
        self.state_path = os.path.join(path, 'state.db')

        if self.framework.mode == READWRITE:
            if not os.path.exists(self.xapian_path):
                os.makedirs(self.xapian_path)
            self.db = xapian.flint_open(self.xapian_path,
                                        xapian.DB_CREATE_OR_OPEN)
        else:
            self.db = xapian.flint_open(self.xapian_path)

    def index(self, document):
        """Add `document` to the index, replacing any copy with the same URI.

        The full content is stored as the Xapian document data and every
        word produced by the framework's reducer is added as a posting.
        """
        doc = xapian.Document()

        # FIXME Xapian doesn't support UTF-8 yet. "Coming soon."
        content = document.content.encode('utf-8')
        uri = unicode(document.uri).encode('utf-8')

        doc.set_data(content)

        # Unique term identifying this document by its URI.
        doc.add_term('Q' + uri)

        # Every word is posted at position 0, so no positional information
        # is recorded in the index.
        words = self.framework.reduce(content)
        for word in words:
            doc.add_posting(word, 0)

        self.db.replace_document('Q' + uri, doc)

    # Indexing already replaces by unique term, so replace is the same call.
    replace = index

    def discard(self, uri):
        """Remove the document identified by `uri` from the index."""
        self.db.delete_document('Q' + unicode(uri).encode('utf-8'))

    def fetch(self, uri):
        """Return the indexed copy of the document identified by `uri`.

        Raises DocumentNotFound when no document carries the URI's term.
        """
        term = 'Q' + unicode(uri).encode('utf-8')
        for docid in self.db.postlist(term):
            # The first element of a posting list entry is the document id.
            doc = self.db.get_document(docid[0])
            # TODO fetch attributes
            return Document(uri=uri, content=doc.get_data().decode('utf-8'),
                            quality=0.95)
        raise DocumentNotFound(uri)

    def __iter__(self):
        """Yield the URI (as unicode) of every indexed document."""
        terms = self.db.allterms()
        terms.skip_to('Q')
        for term in terms:
            # Terms are sorted, so the first one without the prefix marks
            # the end of the URI terms.
            if term[0][0] != 'Q':
                return
            yield term[0][1:].decode('utf-8')

    def flush(self):
        """Commit pending changes to disk.

        Does nothing when the database was opened read-only or has already
        been closed: only the writable handle opened in READWRITE mode has
        changes to commit (a read-only xapian.Database offers no flush()).
        """
        if self.db is not None and self.framework.mode == READWRITE:
            self.db.flush()

    def close(self):
        """Flush pending changes and release the database handle.

        Safe to call more than once.
        """
        self.flush()
        # No explicit close call is made; dropping the reference is what
        # releases the handle.
        self.db = None

    def search(self, query):
        """Run `query` against the index and return a XapianResult.

        The query is first reduced with the framework's reducer, then its
        lower-cased, UTF-8 encoded string form is handed to Xapian's query
        parser.
        """
        query.reduce(self.framework.reduce)
        query_parser = xapian.QueryParser()
        xq = query_parser.parse_query(query.as_string().encode('utf-8').lower())
        enquire = xapian.Enquire(self.db)
        enquire.set_query(xq)
        return XapianResult(self, query, enquire)
|---|
| 125 |
|
|---|
| 126 |
|
|---|
| 127 |
# Factory through which the plugin machinery builds XapianIndexer instances.
# NOTE(review): max_word_length=int presumably declares an integer-typed
# option -- confirm against PluginFactory.
indexer_factory = PluginFactory(
    XapianIndexer,
    max_word_length=int,
)
|---|
| 128 |
|
|---|
| 129 |
|
|---|
| 130 |
class XapianResult(Result):
    """Search result backed by the ``xapian.Enquire`` object that
    ``XapianIndexer.search`` passes in as the result context.

    Every access issues its own ``get_mset`` call for just the window of
    matches it needs.
    """

    # Number of top-ranked hits produced by plain iteration.
    _ITER_LIMIT = 20

    def __iter__(self):
        """Yield a Hit for each of the first ``_ITER_LIMIT`` matches."""
        for hit in self.context.get_mset(0, self._ITER_LIMIT):
            yield self._translate(hit)

    def __getitem__(self, index):
        """Return the Hit at rank `index` (negative ranks count from the end).

        Raises IndexError when `index` lies outside the match set, as the
        sequence protocol requires.
        """
        if index < 0:
            index += len(self)
        if index < 0:
            raise IndexError(index)
        for hit in self.context.get_mset(index, 1):
            return self._translate(hit)
        raise IndexError(index)

    def __getslice__(self, i, j):
        """Yield Hits for ranks `i` up to, but not including, `j`."""
        # An open-ended slice arrives with j == sys.maxint and reversed
        # bounds give j < i; clamp both ends so get_mset only ever sees a
        # sane, non-negative window.
        i = max(i, 0)
        count = min(j, self._match_limit()) - i
        if count <= 0:
            return
        for hit in self.context.get_mset(i, count):
            yield self._translate(hit)

    def __len__(self):
        """Return the number of documents matching the query.

        The Enquire context is not itself sized, so the whole match set is
        requested (bounded by the database's document count) and measured.
        """
        return self.context.get_mset(0, self._match_limit()).size()

    # Internal methods
    def _match_limit(self):
        """Return the most matches any query can have: the document count."""
        return self.indexer.db.get_doccount()

    def _translate(self, hit):
        """Convert one raw match-set entry into a Hit.

        The document's URI is recovered from its unique ``Q``-prefixed term.
        Raises AssertionError when the document has no such term.
        """
        doc = hit[xapian.MSET_DOCUMENT]
        terms = doc.termlist()
        terms.skip_to('Q')
        try:
            term = terms.next()[0]
        except StopIteration:
            # Must not escape: inside a generator it would silently end
            # the caller's iteration instead of reporting the problem.
            term = ''
        # skip_to() lands on the first term >= 'Q', which need not carry
        # the prefix, so check it before trusting the remainder as a URI.
        if not term.startswith('Q') or len(term) < 2:
            raise AssertionError(
                'uniQue term (URI) not found in document term list')
        # Decoded from UTF-8 to match the URIs XapianIndexer.__iter__ yields.
        uri = term[1:].decode('utf-8')
        return Hit(URI(uri),
                   current=self.indexer.framework.fetch,
                   indexed=self.indexer.fetch,
                   did=hit[xapian.MSET_DID],
                   score=float(hit[xapian.MSET_PERCENT]) / 100.0)
|---|