Test generating and querying Xapian indexes.
This commit is contained in:
parent
bdf397335c
commit
16657d95ee
40
scripts/brain-query
Executable file
40
scripts/brain-query
Executable file
@ -0,0 +1,40 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import xapian
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
|
||||||
|
def main(path, query):
    """Search the Xapian index stored at ``path`` and print the best hits.

    Args:
        path: Directory holding the Xapian database. A ``docid_map.json``
            file is read from this same directory.
        query: Free-text query string handed to Xapian's query parser.

    The parsed query is printed first, followed by one line per match
    (the first 10 matches are requested) showing the 1-based rank, the
    match percentage and the entry that ``docid_map.json`` stores for
    the matched document id.
    """
    db = xapian.Database(path, xapian.DB_OPEN)

    # The id -> node mapping is a JSON file kept inside the database directory.
    with open(os.path.join(path, "docid_map.json"), 'rt') as map_file:
        docid_to_node = json.load(map_file)

    # Configure the parser: English stemmer, STEM_SOME strategy, bound to db.
    parser = xapian.QueryParser()
    english_stemmer = xapian.Stem("english")
    parser.set_stemmer(english_stemmer)
    parser.set_database(db)
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)

    xap_query = parser.parse_query(query)
    print("Parsed query is: {}".format(xap_query))

    enquire = xapian.Enquire(db)
    enquire.set_query(xap_query)

    result_line = "ID {} {}% | DocId: {}"
    for hit in enquire.get_mset(0, 10):
        position = hit.rank + 1  # rank is zero-based; show it 1-based
        score = hit.percent
        # JSON object keys are strings, so the numeric docid is stringified.
        node = docid_to_node[str(hit.document.get_docid())]
        print(result_line.format(position, score, node))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: expects exactly an index path and a query.
    if len(sys.argv) != 3:
        print("Brain-Query")
        print("Usage: {} <path> <query>".format(sys.argv[0]))
        # Use sys.exit() rather than the site-provided exit() helper, which
        # is meant for interactive sessions and may be missing (python -S).
        # A usage error is reported with a non-zero status.
        sys.exit(1)
    main(sys.argv[1], sys.argv[2])
|
@ -6,6 +6,8 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import uuid
|
import uuid
|
||||||
|
import xapian
|
||||||
|
import shutil
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
import org_rw
|
import org_rw
|
||||||
@ -173,8 +175,49 @@ def main(src_top, dest_top):
|
|||||||
source = template.read()
|
source = template.read()
|
||||||
f.write(source.replace('<!-- REPLACE_THIS_WITH_GRAPH -->',
|
f.write(source.replace('<!-- REPLACE_THIS_WITH_GRAPH -->',
|
||||||
json.dumps(graph)))
|
json.dumps(graph)))
|
||||||
|
|
||||||
logging.info("Generated {} files".format(files_generated))
|
logging.info("Generated {} files".format(files_generated))
|
||||||
|
|
||||||
|
# Generate index files
|
||||||
|
t0 = datetime.utcnow()
|
||||||
|
logging.info("Generating text index...")
|
||||||
|
|
||||||
|
xapian_db = os.path.join(dest_top, "xapian")
|
||||||
|
if os.path.exists(xapian_db):
|
||||||
|
shutil.rmtree(xapian_db)
|
||||||
|
db = xapian.WritableDatabase(xapian_db, xapian.DB_CREATE)
|
||||||
|
|
||||||
|
indexer = xapian.TermGenerator()
|
||||||
|
stemmer = xapian.Stem("english")
|
||||||
|
indexer.set_stemmer(stemmer)
|
||||||
|
|
||||||
|
docid_to_node = {}
|
||||||
|
|
||||||
|
for doc in docs:
|
||||||
|
relpath = os.path.relpath(doc.path, src_top)
|
||||||
|
|
||||||
|
if not relpath.startswith("public/"):
|
||||||
|
# print("Skip:", relpath)
|
||||||
|
continue
|
||||||
|
|
||||||
|
changed = False
|
||||||
|
for hl in doc.getAllHeadlines():
|
||||||
|
xapian_doc = xapian.Document()
|
||||||
|
content = "\n".join(doc.dump_headline(hl))
|
||||||
|
|
||||||
|
xapian_doc.set_data(content)
|
||||||
|
indexer.set_document(xapian_doc)
|
||||||
|
indexer.index_text(content)
|
||||||
|
|
||||||
|
doc_id = db.add_document(xapian_doc)
|
||||||
|
docid_to_node[doc_id] = { 'hl': hl.id, 'doc': doc.path }
|
||||||
|
|
||||||
|
docid_map_path = os.path.join(xapian_db, "docid_map.json")
|
||||||
|
with open(docid_map_path, 'wt') as f:
|
||||||
|
json.dump(docid_to_node, f)
|
||||||
|
|
||||||
|
logging.info("Text index generated in {}".format(datetime.utcnow() - t0))
|
||||||
|
|
||||||
|
|
||||||
def print_tree(tree, indentation=0):
|
def print_tree(tree, indentation=0):
|
||||||
return
|
return
|
||||||
|
Loading…
Reference in New Issue
Block a user