new-codigoparallevar/scripts/generate.py

#!/usr/bin/env python3

import sqlite3
import time
import json
import html
import logging
import os
import sys
import uuid
from datetime import datetime
import traceback
import re

import inotify.adapters

import org_rw
from org_rw import OrgTime, dom, Link
from org_rw import dump as dump_org
from org_rw import load as load_org
from org_rw import token_list_to_raw

# File-name suffixes of org-mode sources.
EXTENSIONS = [".org", ".org.txt"]

# Watch mode is on unless WATCH_AND_REBUILD is explicitly set to '0'.
WATCH = os.getenv('WATCH_AND_REBUILD', '1') != '0'

MIN_HIDDEN_HEADLINE_LEVEL = 2
# Id of the headline that is additionally written out as index.html.
INDEX_ID = "ea48ec1d-f9d4-4fb7-b39a-faa7b6e2ba95"
SITE_NAME = "Código para llevar"

# inotify event names that trigger a rebuild while watching.
MONITORED_EVENT_TYPES = (
    'IN_CREATE',
    # 'IN_MODIFY' is left out (commented out in the original list).
    'IN_CLOSE_WRITE',
    'IN_DELETE',
    'IN_MOVED_FROM',
    'IN_MOVED_TO',
    'IN_DELETE_SELF',
    'IN_MOVE_SELF',
)

# Tokenizer for plain text: either a run of characters that are neither
# whitespace nor square brackets, or any single character.
TEXT_OR_LINK_RE = re.compile(r'([^\s\[\]]+|.)')

# The project root is the parent of the directory holding this script.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.dirname(_SCRIPT_DIR)

STATIC_PATH = os.path.join(ROOT_DIR, 'static')

def is_git_path(path):
    """Return True when any component of *path* is exactly ``.git``.

    The path is split on ``os.sep``; partial matches such as ``foo.git`` or
    ``.github`` do not count.
    """
    return ".git" in path.split(os.sep)

def create_db(path):
    """Create a fresh SQLite database at *path* holding the search table.

    Any existing file at *path* is removed first. The new database contains
    a single FTS5 virtual table, ``note_search``.

    Returns:
        The open ``sqlite3`` connection.
    """
    schema = 'CREATE VIRTUAL TABLE note_search USING fts5(note_id, title, body, top_level_title, is_done, is_todo);'

    # Start from scratch rather than reusing a previous build's database.
    if os.path.exists(path):
        os.remove(path)

    connection = sqlite3.connect(path)
    connection.execute(schema)
    return connection

def load_all(top_dir_relative):
    """Load every org document found under *top_dir_relative*.

    The directory tree is walked recursively and every file whose name
    contains ``.org`` is parsed with ``load_org(..., extra_cautious=True)``.

    Args:
        top_dir_relative: Directory to scan; it is made absolute first.

    Returns:
        A list with the loaded documents, in walk order.

    Exits:
        If any file fails to load, the traceback and the offending path are
        printed and the process exits with status 1 (``sys.exit(1)``).
    """
    top = os.path.abspath(top_dir_relative)

    docs = []

    for root, _dirs, files in os.walk(top):
        for name in files:
            # Substring match: also accepts names such as "notes.org.txt".
            if ".org" not in name:
                continue

            path = os.path.join(root, name)

            try:
                # Use a context manager so the handle is closed as soon as the
                # document is parsed instead of leaking one handle per file.
                with open(path) as f:
                    doc = load_org(f, extra_cautious=True)
            except Exception:
                traceback.print_exc()
                print(f"== On {path}")
                sys.exit(1)

            docs.append(doc)

    logging.info("Collected {} files".format(len(docs)))
    return docs

def regen_all(src_top, dest_top, *, docs=None, db=None):
    """Rebuild the whole site: search index, link graph and HTML pages.

    All org documents under ``src_top`` are reloaded from disk. Documents
    whose path relative to ``src_top`` starts with ``public/`` are published:
    their headlines get an id if they lack one (the document is saved back
    when that happens), are inserted into the ``note_search`` table, added to
    the link graph and rendered to ``<id>.node.html`` under ``dest_top``.
    ``graph.json`` and ``graph.html`` are written last.

    Args:
        src_top: Directory holding the org sources.
        dest_top: Output directory; created if missing.
        docs: Ignored. Documents are always reloaded from disk; the parameter
            is kept so existing callers can keep passing it.
        db: Open SQLite connection containing the ``note_search`` table.
            Effectively required: the ``None`` default fails on first use.

    Returns:
        The list of documents loaded from ``src_top``.
    """
    files_generated = 0
    cur = db.cursor()
    cur.execute('DELETE FROM note_search;')

    docs = load_all(src_top)
    doc_to_headline_remapping = {}

    os.makedirs(dest_top, exist_ok=True)

    ## Build headline list
    # This includes a virtual headline for ID-referenced documents.
    all_headlines = []
    main_headlines_by_path = {}
    # Top-level "Related" headline of each document, keyed by the id of that
    # document's main headline. Kept per document so that the graph pass
    # below does not depend on variables left over from this loop.
    related_by_main_headline_id = {}
    for doc in docs:
        relpath = os.path.relpath(doc.path, src_top)
        if not relpath.startswith("public/"):
            # print("Skip:", relpath)
            continue

        changed = False
        headlines = list(doc.getAllHeadlines())
        related = None

        # Walk backwards so popping the "Related" headline keeps the
        # remaining indices valid.
        for i in reversed(range(len(headlines))):
            headline = headlines[i]
            if headline.title.get_text().strip().lower() == "related" and headline.depth == 1:
                if related is not None:
                    message = "Found duplicated related: {} vs {}".format(
                        related.id, headline.id
                    )
                    print(message)
                    # Raised explicitly so the check survives `python -O`.
                    raise AssertionError(message)
                related = headline
                headlines.pop(i)

        for headline in headlines:
            if headline.id is None:
                headline.id = str(uuid.uuid4())
                changed = True

        if changed:
            print("Updated", relpath)
            save_changes(doc)

        all_headlines.extend(headlines)
        topHeadlines = doc.getTopHeadlines()

        # A document has a "main" headline when it has exactly one top-level
        # headline besides the optional "Related" one.
        if ((len(topHeadlines) == 1 and related is None)
            or (len(topHeadlines) == 2 and related is not None)):

            main_headline = [h for h in topHeadlines if h != related][0]
            main_headlines_by_path[doc.path] = main_headline
            if related is not None:
                related_by_main_headline_id[main_headline.id] = related
            if doc.id is not None:
                doc_to_headline_remapping['id:' + doc.id] = 'id:' + main_headline.id
                files_generated += 1
        elif doc.id is not None:
            logging.error("Cannot render document from id: {}. {} headlines {} related".format(
                relpath,
                len(topHeadlines),
                'with' if related is not None else 'without'
            ))

    # Link schemes that never produce a graph edge: external URLs,
    # attachments/files, mail and git references, and '*' internal links.
    ignored_prefixes = (
        'http://',
        'https://',
        'attachment:',
        'file:',
        'notmuch:',
        'orgit-rev:',
        '*',
    )

    graph = {}
    # Build graph
    for headline in all_headlines:
        links = []
        headline_links = list(headline.get_links())
        # Links listed under the document's "Related" section are attributed
        # to that document's main headline.
        related = related_by_main_headline_id.get(headline.id)
        if related is not None:
            headline_links.extend(related.get_links())

        for link in headline_links:
            target = link.value
            if target.startswith('id:'):
                links.append({'target': target})
            elif target.startswith(ignored_prefixes):
                pass # Ignore, see `ignored_prefixes`
            elif not target.strip() or ':' not in target.split()[0]:
                pass # Ignore, internal (blank targets are skipped as well)
            elif target.startswith('./'):
                pass # TODO: Properly handle
            else:
                logging.warning('On document {}, unknown link to {}'.format(headline.doc.path, target))

        if headline.parent and isinstance(headline.parent, org_rw.Headline):
            links.append({
                "target": headline.parent.id,
                "relation": "in"
            })
        graph[headline.id] = {
            "title": org_rw.token_list_to_plaintext(headline.title.contents).strip(),
            "links": links,
            "depth": headline.depth,
        }

        topLevelHeadline = headline
        while isinstance(topLevelHeadline.parent, org_rw.Headline):
            topLevelHeadline = topLevelHeadline.parent

        # Save for full-text-search
        cur.execute('''INSERT INTO note_search(note_id, title, body, top_level_title, is_done, is_todo) VALUES (?, ?, ?, ?, ?, ?);''',
                    (
                        headline.id,
                        headline.title.get_text(),
                        ''.join(headline.get_contents('raw')),
                        topLevelHeadline.title.get_text(),
                        headline.is_done,
                        headline.is_todo,
                    ))

    # Update graph, replace document ids with headline ids
    for headline_data in graph.values():
        for link in headline_data['links']:
            if link['target'] in doc_to_headline_remapping:
                link['target'] = doc_to_headline_remapping[link['target']]

    # Render docs after we've built the graph
    # Render main headlines
    full_graph_info = { "nodes": graph, "main_headlines": main_headlines_by_path }
    for _docpath, main_headline in main_headlines_by_path.items():
        if main_headline.doc.id:
            endpath = os.path.join(dest_top, main_headline.doc.id + ".node.html")
            rendered = render_as_document(main_headline, main_headline.doc, headlineLevel=0, graph=full_graph_info,
                                          title=org_rw.token_list_to_plaintext(main_headline.title.contents))
            with open(endpath, "wt") as f:
                f.write(rendered)

    # Render all headlines
    for headline in all_headlines:
        endpath = os.path.join(dest_top, headline.id + ".node.html")

        # Render HTML once; the index page reuses the same markup.
        rendered = render_as_document(headline, headline.doc, headlineLevel=0, graph=full_graph_info,
                                      title=org_rw.token_list_to_plaintext(headline.title.contents))
        with open(endpath, "wt") as f:
            f.write(rendered)
        files_generated += 1

        if headline.id == INDEX_ID:
            index_endpath = os.path.join(dest_top, "index.html")
            with open(index_endpath, "wt") as f:
                f.write(rendered)
            files_generated += 1

    # Output graph files
    graphpath = os.path.join(dest_top, "graph.json")
    with open(graphpath, "wt") as f:
        json.dump(obj=graph, fp=f, indent=2)

    template_path = os.path.join(os.path.dirname(os.path.abspath(dest_top)), '..', 'static', 'graph_explorer.html')
    with open(template_path, 'rt') as template:
        source = template.read()
    graph_explorer_path = os.path.join(dest_top, "graph.html")
    with open(graph_explorer_path, 'wt') as f:
        f.write(source.replace('<!-- REPLACE_THIS_WITH_GRAPH -->',
                               json.dumps(graph)))

    logging.info("Generated {} files".format(files_generated))
    cur.close()
    db.commit()
    return docs

def main(src_top, dest_top):
    """Build the site once and, in watch mode, rebuild on every change.

    Args:
        src_top: Directory holding the org sources.
        dest_top: Output directory; created if missing.

    Returns:
        0 after the initial build when ``WATCH`` is false. In watch mode the
        function loops over inotify events and normally does not return.
    """
    # The watcher is set up before the initial build starts.
    notifier = inotify.adapters.InotifyTrees([src_top, STATIC_PATH])

    ## Initial load
    t0 = time.time()

    os.makedirs(dest_top, exist_ok=True)
    db = create_db(os.path.join(dest_top, 'db.sqlite3'))
    docs = regen_all(src_top, dest_top, db=db)

    if not WATCH:
        logging.info("Build completed in {:.2f}s".format(time.time() - t0))
        return 0

    logging.info("Initial load completed in {:.2f}s".format(time.time() - t0))
    ## Updating
    for event in notifier.event_gen(yield_nones=False):
        (_ev, event_types, directory, filename) = event
        if not any(event_type in MONITORED_EVENT_TYPES for event_type in event_types):
            continue
        if is_git_path(directory):
            continue
        filepath = os.path.join(directory, filename)
        print("CHANGED: {}".format(filepath))
        t0 = time.time()
        try:
            docs = regen_all(src_top, dest_top, docs=docs, db=db)
        except (Exception, SystemExit):
            # SystemExit is caught on purpose: load_all() calls sys.exit(1)
            # when a file fails to parse, and a broken file must not stop the
            # watcher. KeyboardInterrupt is deliberately NOT caught so Ctrl-C
            # still terminates the process during a rebuild.
            logging.error(traceback.format_exc())
            logging.error("Loading new templates failed 😿")
            continue
        logging.info("Updated all in {:.2f}s".format(time.time() - t0))

def get_headline_with_name(target_name, doc):
    """Find the first headline of *doc* whose title equals *target_name*.

    Both the requested name and each headline title are compared after
    stripping surrounding whitespace; the comparison is case-sensitive.

    Returns:
        The first matching headline, or ``None`` when there is none.
    """
    wanted = target_name.strip()
    matches = (
        candidate
        for candidate in doc.getAllHeadlines()
        if candidate.title.get_text().strip() == wanted
    )
    return next(matches, None)

def print_tree(tree, indentation=0, headline=None, *, enabled=False):
    """Debug helper that dumps the structure of a DOM tree to stdout.

    The dump is off by default: unless ``enabled`` is true the function
    returns immediately without printing anything, which is what every
    existing caller gets.

    Args:
        tree: Iterable of DOM elements.
        indentation: Number of spaces printed before each leaf.
        headline: Headline the tree belongs to; only passed along.
        enabled: Set to True to actually print the tree.
    """
    # if headline and headline.id != INDEX_ID:
    #     return
    if not enabled:
        return
    for element in tree:
        if hasattr(element, "children"):
            if len(element.children) > 0:
                print_element(element.children, indentation + 1, headline, enabled=enabled)
                print()

        elif hasattr(element, "content"):
            for content in element.content:
                print_element(content, indentation + 1, headline, enabled=enabled)


def print_element(element, indentation, headline, *, enabled=False):
    """Print one DOM element: links and strings directly, anything else as a tree.

    Args:
        element: An ``org_rw.Link``, a string, or a nested iterable of elements.
        indentation: Number of spaces printed before the element.
        headline: Headline the element belongs to; only passed along.
        enabled: Forwarded to ``print_tree`` for nested elements.
    """
    if isinstance(element, org_rw.Link):
        print(" " * indentation, "Link:", element.get_raw())
    elif isinstance(element, str):
        print(" " * indentation, "{" + element + "}", type(element))
    else:
        print_tree(element, indentation, headline, enabled=enabled)


def render_property_drawer(element, acc, headline, graph):
    """Renderer for property drawers: produces no output, `acc` is untouched."""
    return None


def render_logbook_drawer(element, acc, headline, graph):
    """Renderer for logbook drawers: produces no output, `acc` is untouched."""
    return None


def render_property_node(element, acc, headline, graph):
    """Renderer for property nodes: produces no output, `acc` is untouched."""
    return None


def render_list_group(element, acc, headline, graph):
    """Render a list group as ``<ul>`` wrapping its rendered children."""
    opening_tag, closing_tag = "<ul>", "</ul>"

    acc.append(opening_tag)
    render_tree(element.children, acc, headline, graph)
    acc.append(closing_tag)

def render_table(element, acc, headline, graph):
    """Render a table as ``<table>`` wrapping its rendered children.

    The parameters follow the ``(element, acc, headline, graph)`` order that
    ``render_tag`` uses when it calls every renderer positionally, so the
    headline and graph reach the children in the right slots.
    """
    acc.append("<table>")
    render_tree(element.children, acc, headline, graph)
    acc.append("</table>")

def render_table_row(element, acc, headline, graph):
    """Render a table row: one ``<td>`` per cell, with the cell text escaped."""
    acc.append("<tr>")
    for cell_text in element.cells:
        acc.extend(("<td>", html.escape(cell_text), "</td>"))
    acc.append("</tr>")

def render_table_separator_row(element, acc, headline, graph):
    """Render a table separator as an empty, specially-classed ``<tr>``."""
    separator_markup = "<tr class='__table-separator'></tr>"
    acc.append(separator_markup)

def render_list_item(element, acc, headline, graph):
    """Render a list item as ``<li>``.

    An optional tag is rendered first inside ``<span class='tag'>``; the
    item's content always follows inside ``<span class='item'>``.
    """
    acc.append("<li>")

    tag_tokens = element.tag
    if tag_tokens is not None:
        acc.append("<span class='tag'>")
        render_text_tokens(tag_tokens, acc, headline, graph)
        acc.append("</span>")

    acc.append("<span class='item'>")
    render_text_tokens(element.content, acc, headline, graph)
    acc.append("</span></li>")


def render_code_block(element, acc, headline, graph):
    """Render a code block as ``<pre class="LANG"><code>...</code></pre>``.

    The lower-cased ``element.subtype`` becomes the class name (escaped, as
    it ends up inside an HTML attribute). The code itself is HTML-escaped and
    the leading-space indentation shared by all non-blank lines is removed.

    Empty or blank-only blocks are rendered as-is: with no non-blank line
    there is no common indentation, so nothing is stripped.
    """
    language = html.escape(element.subtype.lower())
    acc.append('<pre class="{}"><code>'.format(language))

    # Escaping does not touch spaces or newlines, so indentation can be
    # measured on the escaped text.
    lines = html.escape(element.lines).split('\n')

    # Remove indentation common to all lines.
    # `default=0` covers blocks without any non-blank line, for which a
    # plain min() over an empty sequence would raise ValueError.
    base_indentation = min(
        (len(line) - len(line.lstrip(' ')) for line in lines if line.strip()),
        default=0,
    )

    acc.append('\n'.join(line[base_indentation:] for line in lines))
    acc.append('</code></pre>')

def render_results_block(element, acc, headline, graph):
    """Renderer for results drawers: currently produces no output."""
    # TODO:
    # acc.append('<pre class="results"><code>')
    # render_tree(element.children, acc)
    # acc.append('</code></pre>')
    return None

def render_org_text(element, acc, headline, graph):
    """Render an ``org_rw.Text``: convert its contents to DOM tokens, then render them."""
    tokens = org_rw.text_to_dom(element.contents, element)
    render_text_tokens(tokens, acc, headline, graph)

def render_text(element, acc, headline, graph):
    """Render a text node inside ``<div class="text">``."""
    opening_tag, closing_tag = '<div class="text">', '</div>'

    acc.append(opening_tag)
    render_text_tokens(element.content, acc, headline, graph)
    acc.append(closing_tag)

def render_text_tokens(tokens, acc, headline, graph):
    """Render a sequence of text tokens as HTML, wrapped in ``<p>...</p>``.

    Tokens are either plain strings or ``Link`` objects; anything else is
    silently skipped.

    * Strings are split into paragraphs on blank lines. Bare URLs
      (``http``, ``https``, ``ftp``, ``ftps``) found in them become anchors;
      everything else is HTML-escaped.
    * ``Link`` targets are rewritten to the generated pages where possible:
      ``id:`` links, relative links to documents present in
      ``graph['main_headlines']`` and ``* Headline`` links within the same
      document. Unresolvable targets are logged and left unchanged.

    Args:
        tokens: Iterable of strings and ``Link`` objects.
        acc: List that receives the generated HTML fragments.
        headline: Headline being rendered; its ``doc`` is used to resolve
            relative and in-document links.
        graph: Mapping with a ``'main_headlines'`` entry (document path to
            main headline).

    Raises:
        Exception: A word contains ``:/`` but is not a supported URL scheme.
        NotImplementedError: A ``Link`` uses an unknown target type.
    """
    acc.append('<p>')
    for chunk in tokens:
        if isinstance(chunk, str):
            contents = []
            for line in chunk.split('\n\n'):
                line_chunks = []
                for word in TEXT_OR_LINK_RE.findall(line):
                    if ':/' in word and not word.startswith('org-protocol://'):
                        if not word.startswith(('http://', 'https://', 'ftp://', 'ftps://')):
                            raise Exception('Is this a link? {} (on {})\nLine: {}\nChunks: {}'.format(word, headline.doc.path, line, line_chunks))
                        # The URL is escaped too: it lands inside an attribute.
                        line_chunks.append('<a href="{url}">{description}</a>'
                                           .format(url=html.escape(word),
                                                   description=html.escape(word)))
                    else:
                        line_chunks.append(html.escape(word))
                contents.append(''.join(line_chunks))

            acc.append('<span class="line">{}</span>'.format('</p><p>'.join(contents)))

        elif isinstance(chunk, Link):
            link_target = chunk.value
            if link_target.startswith('id:'):
                link_target = './' + link_target[3:] + '.node.html'
            elif link_target.startswith('./') or link_target.startswith('../'):
                if '::' in link_target:
                    logging.warning('Not implemented headline links to other files. Used on {}'.format(link_target))

                else:
                    target_path = os.path.abspath(os.path.join(os.path.dirname(headline.doc.path), link_target))
                    if target_path not in graph['main_headlines']:
                        logging.warning('Link to doc not in graph: {}'.format(target_path))
                    else:
                        link_target = './' + graph['main_headlines'][target_path].id + '.node.html'
            elif link_target.startswith('attachment:'):
                logging.warning('Not implemented `attachment:` links. Used on {}'.format(link_target))
            elif link_target.startswith('* '):
                # Drop only the leading "* " marker. str.lstrip('* ') would
                # also eat asterisks that belong to the headline title.
                target_headline = get_headline_with_name(link_target[2:], headline.doc)
                if target_headline is None:
                    logging.warning('No headline found corresponding to {}. On file {}'.format(link_target, headline.doc.path))
                else:
                    link_target = './' + target_headline.id + '.node.html'
            else:
                if not link_target.startswith(('https://', 'http://', '/')):
                    raise NotImplementedError('Unknown link type: {}'
                                              .format(link_target))
            description = chunk.description
            if description is None:
                description = chunk.value

            acc.append('<a href="{}">{}</a>'.format(
                html.escape(link_target),
                html.escape(description),
            ))
        # else:
        #     raise NotImplementedError('TextToken: {}'.format(chunk))
    acc.append('</p>')


def render_tag(element, acc, headline, graph):
    """Dispatch *element* to the renderer registered for its exact type.

    The lookup uses ``type(element)``, so subclasses of the listed types are
    not matched.

    Returns:
        Whatever the selected renderer returns.

    Raises:
        KeyError: No renderer is registered for ``type(element)``.
    """
    renderers = {
        dom.PropertyDrawerNode: render_property_drawer,
        dom.LogbookDrawerNode: render_logbook_drawer,
        dom.PropertyNode: render_property_node,
        dom.ListGroupNode: render_list_group,
        dom.ListItem: render_list_item,
        dom.TableNode: render_table,
        dom.TableSeparatorRow: render_table_separator_row,
        dom.TableRow: render_table_row,
        dom.CodeBlock: render_code_block,
        dom.Text: render_text,
        dom.ResultsDrawerNode: render_results_block,
        org_rw.Text: render_org_text,
    }
    renderer = renderers[type(element)]
    return renderer(element, acc, headline, graph)


def render_tree(tree, acc, headline, graph):
    """Render every element of *tree* in order, appending the output to *acc*."""
    for node in tree:
        render_tag(node, acc, headline, graph)

def render_inline(tree, f, headline, graph):
    """Run renderer *f* on *tree* and return its output as a single string.

    *f* is called as ``f(tree, acc, headline, graph)`` with a fresh list as
    ``acc``; the fragments it appends are concatenated and returned.
    """
    fragments = []
    f(tree, fragments, headline, graph)
    return "".join(fragments)


def render_as_document(headline, doc, headlineLevel, graph, title):
    """Return the full HTML page for *headline*.

    A headline without a parent headline is rendered as a complete note
    page. A nested headline instead gets a small redirect page that sends
    the browser to its top-level ancestor's page, anchored at the headline.

    Args:
        headline: Headline to render.
        doc: Document the headline belongs to.
        headlineLevel: Nesting level passed on to ``render``.
        graph: Graph information passed on to ``render``.
        title: Plain-text page title; it is HTML-escaped on the redirect
            page and passed unchanged to ``as_document`` otherwise.

    Returns:
        The page as a string.
    """
    if not isinstance(headline.parent, org_rw.Headline):
        return as_document(render(headline, doc, graph=graph, headlineLevel=headlineLevel), title)

    topLevelHeadline = headline.parent
    while isinstance(topLevelHeadline.parent, org_rw.Headline):
        topLevelHeadline = topLevelHeadline.parent

    # Titles and ids come from the notes themselves, so escape everything
    # that is interpolated into markup or attribute values.
    safe_title = html.escape(title)
    target = html.escape('./{}.node.html#{}'.format(topLevelHeadline.id, headline.id))
    top_title = html.escape(org_rw.token_list_to_plaintext(topLevelHeadline.title.contents))

    return f"""<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <title>{safe_title} @ {SITE_NAME}</title>
    <meta http-equiv="refresh" content="0;{target}" />
    <link href="../static/style.css" rel="stylesheet"/>
  </head>
  <body>
    <nav>
      <h1><a href="./index.html">Código para llevar [Notes]</a></h1>
    </nav>
    <a href='{target}'>Sending you to the main note... [{top_title}]</a>
  </body>
</html>
        """

def render(headline, doc, graph, headlineLevel):
    """Render *headline* and, recursively, its children as an HTML fragment.

    The result is a ``<div class="node ...">`` holding the headline's state,
    title and tags, followed by its rendered contents and child nodes.

    Args:
        headline: Headline to render.
        doc: Document the headline belongs to (used for error reporting and
            passed down to the children).
        graph: Graph information passed on to the renderers.
        headlineLevel: Nesting level; titles below the top level toggle the
            node when clicked.

    Returns:
        The HTML fragment as a string.
    """
    try:
        dom_tree = headline.as_dom()
    except BaseException:
        # Log which document failed, then let the error propagate.
        logging.error("Error generating DOM for {}".format(doc.path))
        raise
    print_tree(dom_tree, indentation=2, headline=headline)

    content = []
    render_tree(dom_tree, content, headline, graph)
    content.extend(
        render(child, doc, headlineLevel=headlineLevel+1, graph=graph)
        for child in headline.children
    )

    state = (
        ""
        if headline.state is None
        else f'<span class="state todo-{headline.is_todo} state-{headline.state}">{headline.state}</span>'
    )
    todo_state = "todo" if headline.is_todo else "done"

    tag_spans = "".join(
        f'<span class="tag">{html.escape(tag)}</span>'
        for tag in headline.shallow_tags
    )
    tags = f'<span class="tags">{tag_spans}</span>'

    # Collapsing by depth (MIN_HIDDEN_HEADLINE_LEVEL) is currently disabled:
    # every node starts expanded.
    display_state = 'expanded'

    title = render_inline(headline.title, render_tag, headline, graph)

    node_id = html.escape(headline.id)
    if headlineLevel > 0:
        title = f"<a href=\"javascript:toggle_expand('{node_id}')\">{title}</a>"

    body = ''.join(content)
    return f"""
<div id="{node_id}" class="node {todo_state} {display_state}">
  <h1 class="title">
    {state}
    {title}
    {tags}
  </h1>
  <div class='contents'>
    {body}
  </div>
</div>
"""


def as_document(html, title):
    """Wrap an HTML fragment in the full note page.

    Args:
        html: Already-rendered HTML fragment placed inside ``<body>``; it is
            inserted as-is.
        title: Plain-text page title; it is HTML-escaped before being placed
            in ``<title>``.

    Returns:
        The complete page as a string.
    """
    # The `html` parameter shadows the stdlib module of the same name, so the
    # escaping helper is imported locally under a different name.
    from html import escape as escape_html

    safe_title = escape_html(title)
    return f"""<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <title>{safe_title} @ {SITE_NAME}</title>
    <link href="../static/style.css" rel="stylesheet"/>
    <script type="text/javascript">
      function toggle_expand(header_id) {{
        var e = document.getElementById(header_id);
        if (e.classList.contains('expanded')) {{
          e.classList.add('collapsed');
          e.classList.remove('expanded');
        }}
        else {{
          e.classList.add('expanded');
          e.classList.remove('collapsed');
        }}
      }}
    </script>
  </head>
  <body>
    <nav>
      <h1><a href="./index.html">Código para llevar [Notes]</a></h1>
      <input type="text" id="searchbox" disabled="true" placeholder="Search (requires JS)" />
    </nav>
    {html}

    <script src="../static/search-box.js"></script>
    <script type="text/javascript">_codigoparallevar_enable_search_box('#searchbox', {{placeholder: 'Search...'}})</script>
  </body>
</html>
    """


def save_changes(doc):
    """Write *doc* back to the file it was loaded from (``doc.path``)."""
    target_path = doc.path
    assert target_path is not None
    with open(target_path, "wt") as out:
        dump_org(doc, out)


if __name__ == "__main__":
    if len(sys.argv) != 3:
        # A wrong invocation is an error: report it on stderr and exit with a
        # non-zero status so shells and scripts can detect the failure.
        print("Usage: {} SOURCE_TOP DEST_TOP".format(sys.argv[0]), file=sys.stderr)
        sys.exit(2)

    logging.basicConfig(level=logging.INFO, format="%(levelname)-8s %(message)s")
    # sys.exit is used instead of the `exit` helper, which is only injected
    # by the `site` module.
    sys.exit(main(sys.argv[1], sys.argv[2]))