Compare commits

...

1 Commits

Author SHA1 Message Date
Sergio Martínez Portela
16657d95ee Test generating and querying Xapian indexes. 2022-06-04 19:24:02 +02:00
2 changed files with 83 additions and 0 deletions

40
scripts/brain-query Executable file
View File

@ -0,0 +1,40 @@
#!/usr/bin/env python3
import xapian
import sys
import os
import json
def main(path, query):
    """Run a free-text query against a Xapian index and print the best hits.

    Args:
        path: Directory holding the Xapian database. It must also contain a
            ``docid_map.json`` file whose keys are stringified Xapian
            document ids; the mapped value is what gets printed for a hit.
        query: Query string, parsed with Xapian's ``QueryParser`` using
            English stemming.

    Prints the parsed query followed by one line per match (at most the
    first ten), each showing its 1-based rank, match percentage and the
    mapped entry for the document.
    """
    database = xapian.Database(path, xapian.DB_OPEN)

    # Side-car file stored next to the index: document id -> node info.
    with open(os.path.join(path, "docid_map.json"), 'rt') as map_file:
        node_by_docid = json.load(map_file)

    # Configure the parser; STEM_SOME lets Xapian decide which terms to stem.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("english"))
    parser.set_database(database)
    parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)

    parsed = parser.parse_query(query)
    print("Parsed query is: {}".format(parsed))

    searcher = xapian.Enquire(database)
    searcher.set_query(parsed)

    # Only the first page of results (offset 0, up to 10 matches) is shown.
    for hit in searcher.get_mset(0, 10):
        position = hit.rank + 1  # rank is zero-based; display it one-based
        score = hit.percent
        # JSON object keys are always strings, hence the str() on the id.
        node = node_by_docid[str(hit.document.get_docid())]
        print("ID {} {}% | DocId: {}".format(position, score, node))
if __name__ == '__main__':
    # Exactly two arguments are required: the index directory and the query.
    if len(sys.argv) != 3:
        print("Brain-Query")
        print("Usage: {} <path> <query>".format(sys.argv[0]))
        # A wrong invocation is an error, so report a non-zero status that
        # calling shells/scripts can detect. sys.exit is used instead of the
        # bare exit() helper, which is injected by the `site` module for
        # interactive sessions and is not available under `python -S`.
        sys.exit(1)
    main(sys.argv[1], sys.argv[2])

View File

@ -6,6 +6,8 @@ import logging
import os
import sys
import uuid
import xapian
import shutil
from datetime import datetime
import org_rw
@ -173,8 +175,49 @@ def main(src_top, dest_top):
source = template.read()
f.write(source.replace('<!-- REPLACE_THIS_WITH_GRAPH -->',
json.dumps(graph)))
logging.info("Generated {} files".format(files_generated))
# Generate index files
t0 = datetime.utcnow()
logging.info("Generating text index...")
xapian_db = os.path.join(dest_top, "xapian")
if os.path.exists(xapian_db):
shutil.rmtree(xapian_db)
db = xapian.WritableDatabase(xapian_db, xapian.DB_CREATE)
indexer = xapian.TermGenerator()
stemmer = xapian.Stem("english")
indexer.set_stemmer(stemmer)
docid_to_node = {}
for doc in docs:
relpath = os.path.relpath(doc.path, src_top)
if not relpath.startswith("public/"):
# print("Skip:", relpath)
continue
changed = False
for hl in doc.getAllHeadlines():
xapian_doc = xapian.Document()
content = "\n".join(doc.dump_headline(hl))
xapian_doc.set_data(content)
indexer.set_document(xapian_doc)
indexer.index_text(content)
doc_id = db.add_document(xapian_doc)
docid_to_node[doc_id] = { 'hl': hl.id, 'doc': doc.path }
docid_map_path = os.path.join(xapian_db, "docid_map.json")
with open(docid_map_path, 'wt') as f:
json.dump(docid_to_node, f)
logging.info("Text index generated in {}".format(datetime.utcnow() - t0))
def print_tree(tree, indentation=0):
return