WIP: Save blog & notes data on FTS search DB.

This commit is contained in:
Sergio Martínez Portela 2023-10-04 00:19:39 +02:00
parent b8eadc8b1e
commit 6d621ffc3c
2 changed files with 49 additions and 10 deletions

View File

@ -22,6 +22,7 @@ import shutil
import traceback import traceback
import time import time
import re import re
import sqlite3
from typing import List from typing import List
from bs4 import BeautifulSoup as bs4 from bs4 import BeautifulSoup as bs4
@ -63,6 +64,7 @@ JINJA_ENV = jinja2.Environment(
autoescape=jinja2.select_autoescape() autoescape=jinja2.select_autoescape()
) )
PARSER_NAMESPACE = 'codigoparallevar.com/blog'
WATCH = True WATCH = True
if os.getenv('WATCH_AND_REBUILD', '1') == '0': if os.getenv('WATCH_AND_REBUILD', '1') == '0':
WATCH = False WATCH = False
@ -176,6 +178,12 @@ def get_out_path(front_matter):
return out_path return out_path
def create_db(path):
    """Open (or create) the search database at *path* and prepare it for indexing.

    Ensures the ``note_search`` FTS5 virtual table exists, then removes any
    rows previously indexed under this parser's namespace so a fresh index
    can be written without touching entries from other generators sharing
    the same database file.

    Returns the open :class:`sqlite3.Connection` (caller is responsible for
    committing and closing it).
    """
    connection = sqlite3.connect(path)
    connection.execute('CREATE VIRTUAL TABLE IF NOT EXISTS note_search USING fts5(note_id, title, body, top_level_title, is_done, is_todo, parser_namespace, url, tokenize="trigram");')
    # Drop only this namespace's stale rows; other namespaces' data stays intact.
    connection.execute('DELETE FROM note_search WHERE parser_namespace = ?;', (PARSER_NAMESPACE,))
    return connection
def load_all(top_dir_relative): def load_all(top_dir_relative):
top = os.path.abspath(top_dir_relative) top = os.path.abspath(top_dir_relative)
@ -456,10 +464,39 @@ def render_rss(docs, dest_top):
f.write(result) f.write(result)
def regen_all(source_top, dest_top, docs=None): def regen_all(source_top, dest_top, docs=None, db=None):
if docs is None: if docs is None:
docs = load_all(source_top) docs = load_all(source_top)
cur = db.cursor()
cleaned_db = False
try:
cur.execute('DELETE FROM note_search WHERE parser_namespace = ?;', (PARSER_NAMESPACE,))
cleaned_db = True
except sqlite3.OperationalError as err:
if WATCH:
logging.warning("Error pre-cleaning DB, search won't be updated")
else:
raise
# Save posts to DB
for (doc, front_matter, out_path) in docs.values():
cur.execute('''INSERT INTO note_search(note_id, title, body, top_level_title, is_done, is_todo, parser_namespace, url) VALUES (?, ?, ?, ?, ?, ?, ?, ?);''',
(
out_path,
front_matter['title'],
doc,
front_matter['title'],
False,
False,
PARSER_NAMESPACE,
out_path + '/index.html',
))
cur.close()
db.commit()
# Render posts # Render posts
for (doc, front_matter, out_path) in docs.values(): for (doc, front_matter, out_path) in docs.values():
doc_full_path = os.path.join(dest_top, out_path) doc_full_path = os.path.join(dest_top, out_path)
@ -513,7 +550,8 @@ def main(source_top, dest_top):
## Initial load ## Initial load
t0 = time.time() t0 = time.time()
logging.info("Initial load...") logging.info("Initial load...")
docs = regen_all(source_top, dest_top) db = create_db(os.path.join(dest_top, '..', 'db.sqlite3'))
docs = regen_all(source_top, dest_top, db=db)
logging.info("Initial load completed in {:.2f}s".format(time.time() - t0)) logging.info("Initial load completed in {:.2f}s".format(time.time() - t0))
if not WATCH: if not WATCH:
@ -557,7 +595,7 @@ def main(source_top, dest_top):
if is_static_resource: if is_static_resource:
logging.info("Updated static resources in {:.2f}s".format(time.time() - t0)) logging.info("Updated static resources in {:.2f}s".format(time.time() - t0))
else: else:
docs = regen_all(source_top, dest_top, docs) docs = regen_all(source_top, dest_top, docs, db=db)
logging.info("Updated all in {:.2f}s".format(time.time() - t0)) logging.info("Updated all in {:.2f}s".format(time.time() - t0))
else: else:

View File

@ -46,6 +46,7 @@ IMG_EXTENSIONS = set([
"gif", "gif",
]) ])
SKIPPED_TAGS = set(['attach']) SKIPPED_TAGS = set(['attach'])
PARSER_NAMESPACE = 'codigoparallevar.com/notes'
WATCH = True WATCH = True
if os.getenv('WATCH_AND_REBUILD', '1') == '0': if os.getenv('WATCH_AND_REBUILD', '1') == '0':
@ -88,11 +89,9 @@ def is_git_path(path):
return any([chunk == ".git" for chunk in path.split(os.sep)]) return any([chunk == ".git" for chunk in path.split(os.sep)])
def create_db(path): def create_db(path):
if os.path.exists(path):
os.unlink(path)
db = sqlite3.connect(path) db = sqlite3.connect(path)
db.execute('CREATE VIRTUAL TABLE note_search USING fts5(note_id, title, body, top_level_title, is_done, is_todo, tokenize="trigram");') db.execute('CREATE VIRTUAL TABLE IF NOT EXISTS note_search USING fts5(note_id, title, body, top_level_title, is_done, is_todo, parser_namespace, url, tokenize="trigram");')
db.execute('DELETE FROM note_search WHERE parser_namespace = ?;', (PARSER_NAMESPACE,))
return db return db
def load_all(top_dir_relative): def load_all(top_dir_relative):
@ -126,7 +125,7 @@ def regen_all(src_top, dest_top, *, docs=None, db=None):
cleaned_db = False cleaned_db = False
try: try:
cur.execute('DELETE FROM note_search;') cur.execute('DELETE FROM note_search WHERE parser_namespace = ?;', (PARSER_NAMESPACE,))
cleaned_db = True cleaned_db = True
except sqlite3.OperationalError as err: except sqlite3.OperationalError as err:
if WATCH: if WATCH:
@ -262,7 +261,7 @@ def regen_all(src_top, dest_top, *, docs=None, db=None):
topLevelHeadline = topLevelHeadline.parent topLevelHeadline = topLevelHeadline.parent
# Save for full-text-search # Save for full-text-search
cur.execute('''INSERT INTO note_search(note_id, title, body, top_level_title, is_done, is_todo) VALUES (?, ?, ?, ?, ?, ?);''', cur.execute('''INSERT INTO note_search(note_id, title, body, top_level_title, is_done, is_todo, parser_namespace, url) VALUES (?, ?, ?, ?, ?, ?, ?, ?);''',
( (
headline.id, headline.id,
headline.title.get_text(), headline.title.get_text(),
@ -270,6 +269,8 @@ def regen_all(src_top, dest_top, *, docs=None, db=None):
topLevelHeadline.title.get_text(), topLevelHeadline.title.get_text(),
headline.is_done, headline.is_done,
headline.is_todo, headline.is_todo,
PARSER_NAMESPACE,
headline.id + '.node.html',
)) ))
# Update graph, replace document ids with headline ids # Update graph, replace document ids with headline ids
@ -356,7 +357,7 @@ def main(src_top, dest_top):
t0 = time.time() t0 = time.time()
os.makedirs(dest_top, exist_ok=True) os.makedirs(dest_top, exist_ok=True)
db = create_db(os.path.join(dest_top, 'db.sqlite3')) db = create_db(os.path.join(dest_top, '..', 'db.sqlite3'))
docs = regen_all(src_top, dest_top, db=db) docs = regen_all(src_top, dest_top, db=db)
if not WATCH: if not WATCH: