From 16657d95ee7a5f7abe0ff25c84edcfade8798162 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergio=20Mart=C3=ADnez=20Portela?=
Date: Sat, 4 Jun 2022 19:24:02 +0200
Subject: [PATCH] Test generating and querying Xapian indexes.

---
 scripts/brain-query | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 scripts/generate.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100755 scripts/brain-query

diff --git a/scripts/brain-query b/scripts/brain-query
new file mode 100755
index 0000000..65c310e
--- /dev/null
+++ b/scripts/brain-query
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+"""Query a Xapian full-text index and print the best matches."""
+
+import xapian
+import sys
+import os
+import json
+
+
+def main(path, query):
+    """Print the top 10 matches for `query` in the Xapian index at `path`.
+
+    `path` is the index directory; it must also hold the `docid_map.json`
+    file that maps Xapian document ids to the node they were built from.
+    """
+    db = xapian.Database(path, xapian.DB_OPEN)
+    docid_map_path = os.path.join(path, "docid_map.json")
+    with open(docid_map_path, 'rt') as f:
+        docid_to_node = json.load(f)
+
+    qp = xapian.QueryParser()
+    stemmer = xapian.Stem("english")
+    qp.set_stemmer(stemmer)
+    qp.set_database(db)
+    qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
+    xap_query = qp.parse_query(query)
+    print("Parsed query is: {}".format(xap_query))
+
+    enquire = xapian.Enquire(db)
+    enquire.set_query(xap_query)
+    matches = enquire.get_mset(0, 10)
+    for match in matches:
+        print(
+            "ID {} {}% | DocId: {}".format(
+                match.rank + 1,
+                match.percent,
+                # JSON object keys are strings, so look the docid up as one.
+                docid_to_node[str(match.document.get_docid())],
+            )
+        )
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        # Wrong invocation: show usage on stderr and signal failure.
+        print("Brain-Query", file=sys.stderr)
+        print("Usage: {} <index-path> <query>".format(sys.argv[0]), file=sys.stderr)
+        sys.exit(1)
+    main(sys.argv[1], sys.argv[2])
diff --git a/scripts/generate.py b/scripts/generate.py
index 05c3f63..004427d 100644
--- a/scripts/generate.py
+++ b/scripts/generate.py
@@ -6,6 +6,8 @@ import logging
 import os
 import sys
 import uuid
+import xapian
+import shutil
 from datetime import datetime
 
 import org_rw
@@ -173,8 +175,49 @@ def main(src_top, dest_top):
             source = template.read()
             f.write(source.replace('', json.dumps(graph)))
+    logging.info("Generated {} files".format(files_generated))
 
+    # Generate index files
+    t0 = datetime.utcnow()
+    logging.info("Generating text index...")
+
+    xapian_db = os.path.join(dest_top, "xapian")
+    if os.path.exists(xapian_db):
+        shutil.rmtree(xapian_db)
+    db = xapian.WritableDatabase(xapian_db, xapian.DB_CREATE)
+
+    indexer = xapian.TermGenerator()
+    stemmer = xapian.Stem("english")
+    indexer.set_stemmer(stemmer)
+
+    docid_to_node = {}
+
+    for doc in docs:
+        relpath = os.path.relpath(doc.path, src_top)
+
+        if not relpath.startswith("public/"):
+            # print("Skip:", relpath)
+            continue
+
+        changed = False
+        for hl in doc.getAllHeadlines():
+            xapian_doc = xapian.Document()
+            content = "\n".join(doc.dump_headline(hl))
+
+            xapian_doc.set_data(content)
+            indexer.set_document(xapian_doc)
+            indexer.index_text(content)
+
+            doc_id = db.add_document(xapian_doc)
+            docid_to_node[doc_id] = { 'hl': hl.id, 'doc': doc.path }
+
+    docid_map_path = os.path.join(xapian_db, "docid_map.json")
+    with open(docid_map_path, 'wt') as f:
+        json.dump(docid_to_node, f)
+
+    logging.info("Text index generated in {}".format(datetime.utcnow() - t0))
+
 
 def print_tree(tree, indentation=0):
     return