WIP: Save blog & notes data to the FTS search DB.

2023-10-04 00:19:39 +02:00
2 changed files with 49 additions and 10 deletions
--- a/scripts/blog.py
+++ b/scripts/blog.py
@ -22,6 +22,7 @@ import shutil
 import traceback
 import time
 import re
+import sqlite3
 from typing import List

 from bs4 import BeautifulSoup as bs4
@ -63,6 +64,7 @@ JINJA_ENV = jinja2.Environment(
    autoescape=jinja2.select_autoescape()
 )

+PARSER_NAMESPACE = 'codigoparallevar.com/blog'
 WATCH = True
 if os.getenv('WATCH_AND_REBUILD', '1') == '0':
    WATCH = False
@ -176,6 +178,12 @@ def get_out_path(front_matter):
    return out_path


+def create_db(path):
+    db = sqlite3.connect(path)
+    db.execute('CREATE VIRTUAL TABLE IF NOT EXISTS note_search USING fts5(note_id, title, body, top_level_title, is_done, is_todo, parser_namespace, url,  tokenize="trigram");')
+    db.execute('DELETE FROM note_search WHERE parser_namespace = ?;', (PARSER_NAMESPACE,))
+    return db
+
 def load_all(top_dir_relative):
    top = os.path.abspath(top_dir_relative)

@ -456,10 +464,39 @@ def render_rss(docs, dest_top):
        f.write(result)


-def regen_all(source_top, dest_top, docs=None):
+def regen_all(source_top, dest_top, docs=None, db=None):
    if docs is None:
        docs = load_all(source_top)

+    cur = db.cursor()
+    cleaned_db = False
+
+    try:
+        cur.execute('DELETE FROM note_search WHERE parser_namespace = ?;', (PARSER_NAMESPACE,))
+        cleaned_db = True
+    except sqlite3.OperationalError as err:
+        if WATCH:
+            logging.warning("Error pre-cleaning DB, search won't be updated")
+        else:
+            raise
+
+    # Save posts to DB
+    for (doc, front_matter, out_path) in docs.values():
+        cur.execute('''INSERT INTO note_search(note_id, title, body, top_level_title, is_done, is_todo, parser_namespace, url) VALUES (?, ?, ?, ?, ?, ?, ?, ?);''',
+                    (
+                        out_path,
+                        front_matter['title'],
+                        doc,
+                        front_matter['title'],
+                        False,
+                        False,
+                        PARSER_NAMESPACE,
+                        out_path + '/index.html',
+                    ))
+
+    cur.close()
+    db.commit()
+
    # Render posts
    for (doc, front_matter, out_path) in docs.values():
        doc_full_path = os.path.join(dest_top, out_path)
@ -513,7 +550,8 @@ def main(source_top, dest_top):
    ## Initial load
    t0 = time.time()
    logging.info("Initial load...")
-    docs = regen_all(source_top, dest_top)
+    db = create_db(os.path.join(dest_top, '..', 'db.sqlite3'))
+    docs = regen_all(source_top, dest_top, db=db)
    logging.info("Initial load completed in {:.2f}s".format(time.time() - t0))

    if not WATCH:
@ -557,7 +595,7 @@ def main(source_top, dest_top):
            if is_static_resource:
                logging.info("Updated static resources in {:.2f}s".format(time.time() - t0))
            else:
-                docs = regen_all(source_top, dest_top, docs)
+                docs = regen_all(source_top, dest_top, docs, db=db)
                logging.info("Updated all in {:.2f}s".format(time.time() - t0))

        else:
--- a/scripts/generate.py
+++ b/scripts/generate.py
@ -46,6 +46,7 @@ IMG_EXTENSIONS = set([
    "gif",
 ])
 SKIPPED_TAGS = set(['attach'])
+PARSER_NAMESPACE = 'codigoparallevar.com/notes'

 WATCH = True
 if os.getenv('WATCH_AND_REBUILD', '1') == '0':
@ -88,11 +89,9 @@ def is_git_path(path):
    return any([chunk == ".git" for chunk in path.split(os.sep)])

 def create_db(path):
-    if os.path.exists(path):
-        os.unlink(path)
-
    db = sqlite3.connect(path)
-    db.execute('CREATE VIRTUAL TABLE note_search USING fts5(note_id, title, body, top_level_title, is_done, is_todo, tokenize="trigram");')
+    db.execute('CREATE VIRTUAL TABLE IF NOT EXISTS note_search USING fts5(note_id, title, body, top_level_title, is_done, is_todo, parser_namespace, url, tokenize="trigram");')
+    db.execute('DELETE FROM note_search WHERE parser_namespace = ?;', (PARSER_NAMESPACE,))
    return db

 def load_all(top_dir_relative):
@ -126,7 +125,7 @@ def regen_all(src_top, dest_top, *, docs=None, db=None):
    cleaned_db = False

    try:
-        cur.execute('DELETE FROM note_search;')
+        cur.execute('DELETE FROM note_search WHERE parser_namespace = ?;', (PARSER_NAMESPACE,))
        cleaned_db = True
    except sqlite3.OperationalError as err:
        if WATCH:
@ -262,7 +261,7 @@ def regen_all(src_top, dest_top, *, docs=None, db=None):
            topLevelHeadline = topLevelHeadline.parent

        # Save for full-text-search
-        cur.execute('''INSERT INTO note_search(note_id, title, body, top_level_title, is_done, is_todo) VALUES (?, ?, ?, ?, ?, ?);''',
+        cur.execute('''INSERT INTO note_search(note_id, title, body, top_level_title, is_done, is_todo, parser_namespace, url) VALUES (?, ?, ?, ?, ?, ?, ?, ?);''',
                    (
                        headline.id,
                        headline.title.get_text(),
@ -270,6 +269,8 @@ def regen_all(src_top, dest_top, *, docs=None, db=None):
                        topLevelHeadline.title.get_text(),
                        headline.is_done,
                        headline.is_todo,
+                        PARSER_NAMESPACE,
+                        headline.id + '.node.html',
                    ))

    # Update graph, replace document ids with headline ids
@ -356,7 +357,7 @@ def main(src_top, dest_top):
    t0 = time.time()

    os.makedirs(dest_top, exist_ok=True)
-    db = create_db(os.path.join(dest_top, 'db.sqlite3'))
+    db = create_db(os.path.join(dest_top, '..', 'db.sqlite3'))
    docs = regen_all(src_top, dest_top, db=db)

    if not WATCH: