WIP: Save blog & notes data on FTS search DB.

This commit is contained in:
Sergio Martínez Portela 2023-10-04 00:19:39 +02:00
parent b8eadc8b1e
commit 6d621ffc3c
2 changed files with 49 additions and 10 deletions

View File

@ -22,6 +22,7 @@ import shutil
import traceback
import time
import re
import sqlite3
from typing import List
from bs4 import BeautifulSoup as bs4
@ -63,6 +64,7 @@ JINJA_ENV = jinja2.Environment(
autoescape=jinja2.select_autoescape()
)
PARSER_NAMESPACE = 'codigoparallevar.com/blog'
WATCH = True
if os.getenv('WATCH_AND_REBUILD', '1') == '0':
WATCH = False
@ -176,6 +178,12 @@ def get_out_path(front_matter):
return out_path
def create_db(path):
    """Open the full-text-search database and clear this script's rows.

    Connects to the SQLite file at `path`, makes sure the `note_search`
    FTS5 virtual table exists (trigram tokenizer), and deletes every row
    whose `parser_namespace` equals this module's PARSER_NAMESPACE. Rows
    stored under any other namespace are left in place.

    No commit is issued here; the open connection is returned so the
    caller can keep writing to it and commit when it is done.
    """
    create_table_sql = 'CREATE VIRTUAL TABLE IF NOT EXISTS note_search USING fts5(note_id, title, body, top_level_title, is_done, is_todo, parser_namespace, url, tokenize="trigram");'
    purge_namespace_sql = 'DELETE FROM note_search WHERE parser_namespace = ?;'

    connection = sqlite3.connect(path)
    connection.execute(create_table_sql)
    # Only drop the entries this script is about to regenerate.
    connection.execute(purge_namespace_sql, (PARSER_NAMESPACE,))
    return connection
def load_all(top_dir_relative):
top = os.path.abspath(top_dir_relative)
@ -456,10 +464,39 @@ def render_rss(docs, dest_top):
f.write(result)
def regen_all(source_top, dest_top, docs=None):
def regen_all(source_top, dest_top, docs=None, db=None):
if docs is None:
docs = load_all(source_top)
cur = db.cursor()
cleaned_db = False
try:
cur.execute('DELETE FROM note_search WHERE parser_namespace = ?;', (PARSER_NAMESPACE,))
cleaned_db = True
except sqlite3.OperationalError as err:
if WATCH:
logging.warning("Error pre-cleaning DB, search won't be updated")
else:
raise
# Save posts to DB
for (doc, front_matter, out_path) in docs.values():
cur.execute('''INSERT INTO note_search(note_id, title, body, top_level_title, is_done, is_todo, parser_namespace, url) VALUES (?, ?, ?, ?, ?, ?, ?, ?);''',
(
out_path,
front_matter['title'],
doc,
front_matter['title'],
False,
False,
PARSER_NAMESPACE,
out_path + '/index.html',
))
cur.close()
db.commit()
# Render posts
for (doc, front_matter, out_path) in docs.values():
doc_full_path = os.path.join(dest_top, out_path)
@ -513,7 +550,8 @@ def main(source_top, dest_top):
## Initial load
t0 = time.time()
logging.info("Initial load...")
docs = regen_all(source_top, dest_top)
db = create_db(os.path.join(dest_top, '..', 'db.sqlite3'))
docs = regen_all(source_top, dest_top, db=db)
logging.info("Initial load completed in {:.2f}s".format(time.time() - t0))
if not WATCH:
@ -557,7 +595,7 @@ def main(source_top, dest_top):
if is_static_resource:
logging.info("Updated static resources in {:.2f}s".format(time.time() - t0))
else:
docs = regen_all(source_top, dest_top, docs)
docs = regen_all(source_top, dest_top, docs, db=db)
logging.info("Updated all in {:.2f}s".format(time.time() - t0))
else:

View File

@ -46,6 +46,7 @@ IMG_EXTENSIONS = set([
"gif",
])
SKIPPED_TAGS = set(['attach'])
PARSER_NAMESPACE = 'codigoparallevar.com/notes'
WATCH = True
if os.getenv('WATCH_AND_REBUILD', '1') == '0':
@ -88,11 +89,9 @@ def is_git_path(path):
return any([chunk == ".git" for chunk in path.split(os.sep)])
def create_db(path):
    """Open the full-text-search database and clear this script's rows.

    Connects to the SQLite file at `path`, creates the `note_search` FTS5
    virtual table if it does not exist yet (trigram tokenizer), and deletes
    every row whose `parser_namespace` equals this module's
    PARSER_NAMESPACE. Rows stored under other namespaces are left
    untouched, so the existing file is no longer removed before connecting.

    No commit is issued here; the open connection is returned so the
    caller can insert the regenerated rows and commit.
    """
    db = sqlite3.connect(path)
    # FTS5 arguments are comma-separated: the `tokenize` option must be its
    # own argument after the `url` column. Without the comma the statement
    # is a malformed column definition instead of a tokenizer setting.
    db.execute('CREATE VIRTUAL TABLE IF NOT EXISTS note_search USING fts5(note_id, title, body, top_level_title, is_done, is_todo, parser_namespace, url, tokenize="trigram");')
    db.execute('DELETE FROM note_search WHERE parser_namespace = ?;', (PARSER_NAMESPACE,))
    return db
def load_all(top_dir_relative):
@ -126,7 +125,7 @@ def regen_all(src_top, dest_top, *, docs=None, db=None):
cleaned_db = False
try:
cur.execute('DELETE FROM note_search;')
cur.execute('DELETE FROM note_search WHERE parser_namespace = ?;', (PARSER_NAMESPACE,))
cleaned_db = True
except sqlite3.OperationalError as err:
if WATCH:
@ -262,7 +261,7 @@ def regen_all(src_top, dest_top, *, docs=None, db=None):
topLevelHeadline = topLevelHeadline.parent
# Save for full-text-search
cur.execute('''INSERT INTO note_search(note_id, title, body, top_level_title, is_done, is_todo) VALUES (?, ?, ?, ?, ?, ?);''',
cur.execute('''INSERT INTO note_search(note_id, title, body, top_level_title, is_done, is_todo, parser_namespace, url) VALUES (?, ?, ?, ?, ?, ?, ?, ?);''',
(
headline.id,
headline.title.get_text(),
@ -270,6 +269,8 @@ def regen_all(src_top, dest_top, *, docs=None, db=None):
topLevelHeadline.title.get_text(),
headline.is_done,
headline.is_todo,
PARSER_NAMESPACE,
headline.id + '.node.html',
))
# Update graph, replace document ids with headline ids
@ -356,7 +357,7 @@ def main(src_top, dest_top):
t0 = time.time()
os.makedirs(dest_top, exist_ok=True)
db = create_db(os.path.join(dest_top, 'db.sqlite3'))
db = create_db(os.path.join(dest_top, '..', 'db.sqlite3'))
docs = regen_all(src_top, dest_top, db=db)
if not WATCH: