new-codigoparallevar/scripts/generate.py

#!/usr/bin/env python3

import sqlite3
import time
import json
import html
import logging
import os
import sys
import uuid
from datetime import datetime
import traceback
import re
from itertools import chain

import inotify.adapters

import org_rw
from org_rw import OrgTime, dom, Link
from org_rw import dump as dump_org
from org_rw import load as load_org
from org_rw import token_list_to_raw

import pygments
import pygments.lexers
import pygments.formatters

# Set custom states
# Append the extra TODO-type keywords used in these notes to org_rw's
# module-level default keyword list.
for state in ("NEXT", "MEETING", "Q", "PAUSED", "SOMETIME", "TRACK", "WAITING"):
    org_rw.DEFAULT_TODO_KEYWORDS.append(state)

# Likewise for the extra keywords appended to the DONE-type defaults.
for state in ("DISCARDED", "VALIDATING"):
    org_rw.DEFAULT_DONE_KEYWORDS.append(state)

# File suffixes that identify org-mode notes.
EXTENSIONS = [".org", ".org.txt"]

# Watching is on unless the WATCH_AND_REBUILD environment variable is "0".
WATCH = os.getenv('WATCH_AND_REBUILD', '1') != '0'

MIN_HIDDEN_HEADLINE_LEVEL = 2
# Headline id of the note that is used as the site index.
INDEX_ID = "ea48ec1d-f9d4-4fb7-b39a-faa7b6e2ba95"
SITE_NAME = "Código para llevar"

# inotify event type names that are acted upon ('IN_MODIFY' is not monitored).
MONITORED_EVENT_TYPES = (
    'IN_CREATE',
    'IN_CLOSE_WRITE',
    'IN_DELETE',
    'IN_MOVED_FROM',
    'IN_MOVED_TO',
    'IN_DELETE_SELF',
    'IN_MOVE_SELF',
)

# Splits text into runs of non-space, non-bracket characters, or single
# characters of anything else.
TEXT_OR_LINK_RE = re.compile(r'([^\s\[\]]+|.)')

# Parent of the directory that contains this script.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

STATIC_PATH = os.path.join(ROOT_DIR, 'static')

class NonExistingLocalNoteError(AssertionError):
    """Error describing a link whose target note id could not be found.

    Carries the unresolved ``note_id`` and the ``src_headline`` that holds
    the link. The exception itself is created without arguments; use
    :meth:`get_message` for a readable description.
    """

    def __init__(self, note_id, src_headline):
        super().__init__()
        self.note_id = note_id
        self.src_headline = src_headline

    def get_message(self):
        """Return a one-line description naming the target and the source headline."""
        source = self.src_headline
        source_id = source.id
        source_title = source.title.get_text().strip()
        return f"Cannot follow link to '{self.note_id}' on headline '{source_id}' ({source_title})"

def is_git_path(path):
    """Tell whether any component of *path* (split on ``os.sep``) is exactly ``.git``."""
    return ".git" in path.split(os.sep)

def create_db(path):
    """Create a brand-new search database at *path* and return its connection.

    A file already present at *path* is deleted first. The new database gets
    a single FTS5 virtual table, ``note_search``, using the trigram tokenizer.
    """
    already_present = os.path.exists(path)
    if already_present:
        os.unlink(path)

    schema = 'CREATE VIRTUAL TABLE note_search USING fts5(note_id, title, body, top_level_title, is_done, is_todo, tokenize="trigram");'
    connection = sqlite3.connect(path)
    connection.execute(schema)
    return connection

def load_all(top_dir_relative):
    """Parse every org file found under a directory tree.

    Walks *top_dir_relative* recursively and loads each file whose name
    contains ``.org`` with ``load_org(..., extra_cautious=True)``.

    Args:
        top_dir_relative: directory to scan; made absolute before walking.

    Returns:
        A list with one parsed document per loaded file.

    Exits:
        If any file fails to load, the traceback and the offending path are
        printed and the process is terminated through ``sys.exit(1)``.
    """
    top = os.path.abspath(top_dir_relative)

    docs = []

    for root, _dirs, files in os.walk(top):
        for name in files:
            # NOTE(review): this is a substring test, so names such as
            # "notes.org.bak" are loaded as well.
            if ".org" not in name:
                continue

            path = os.path.join(root, name)

            try:
                # Use a context manager so the handle is closed once parsed
                # instead of being left for the garbage collector.
                with open(path) as f:
                    doc = load_org(f, extra_cautious=True)
            except Exception:
                traceback.print_exc()
                print(f"== On {path}")
                sys.exit(1)

            docs.append(doc)

    logging.info("Collected {} files".format(len(docs)))
    return docs

def regen_all(src_top, dest_top, *, docs=None, db=None):
    """Regenerate every output file in *dest_top* from the notes in *src_top*.

    The work is done in this order:

    1. Empty the ``note_search`` table.
    2. Reload all documents and collect the headlines of those whose path,
       relative to *src_top*, starts with ``public/``. Headlines without an
       id get a fresh UUID and the document is saved back to disk.
    3. Build the link graph and the backlink graph, and index every headline
       for full-text search.
    4. Render one ``<id>.node.html`` per headline (plus one per document id
       and ``index.html`` for the headline whose id is ``INDEX_ID``).
    5. Write ``graph.json`` and ``graph.html`` and commit the database.

    Args:
        src_top: directory holding the org notes.
        dest_top: output directory; created when missing.
        docs: accepted for call compatibility but ignored, documents are
            always reloaded from *src_top*.
        db: open sqlite3 connection containing the ``note_search`` table.
            Although it defaults to None, a connection is required.

    Returns:
        None.
    """
    files_generated = 0
    cur = db.cursor()

    # When the search table cannot be emptied, indexing is skipped for this
    # run (as the warning says) rather than failing later on the INSERTs.
    try:
        cur.execute('DELETE FROM note_search;')
        cleaned_db = True
    except sqlite3.OperationalError:
        cleaned_db = False
        logging.warning("Error pre-cleaning DB, search won't be updated")

    docs = load_all(src_top)
    doc_to_headline_remapping = {}

    os.makedirs(dest_top, exist_ok=True)

    ## Build headline list
    # This includes a virtual headline for ID-referenced documents.
    all_headlines = []
    main_headlines_by_path = {}
    main_headline_to_docid = {}
    # Main headline id -> top-level "related" headline of the same document.
    # Kept per document so the graph loop below does not depend on whatever
    # values the last iteration of this loop happened to leave behind.
    related_by_main_headline_id = {}
    for doc in docs:
        relpath = os.path.relpath(doc.path, src_top)
        changed = False
        headlines = list(doc.getAllHeadlines())
        related = None
        if not relpath.startswith("public/"):
            # print("Skip:", relpath)
            continue

        # Walk backwards so popping an entry does not shift pending indexes.
        for i in range(len(headlines) - 1, -1, -1):
            headline = headlines[i]
            if headline.title.get_text().strip().lower() == "related" and headline.depth == 1:
                if related is not None:
                    print(
                        "Found duplicated related: {} vs {}".format(
                            related.id, headline.id
                        )
                    )
                    assert related is None
                related = headline
                headlines.pop(i)

        for headline in headlines:
            if headline.id is None:
                headline.id = str(uuid.uuid4())
                changed = True

        if changed:
            print("Updated", relpath)
            save_changes(doc)

        all_headlines.extend(headlines)
        main_headline = None
        topHeadlines = doc.getTopHeadlines()

        # A document has a main headline when it is its only top-level
        # headline, not counting the "related" one.
        if ((len(topHeadlines) == 1 and related is None)
            or (len(topHeadlines) == 2 and related is not None)):

            main_headline = [h for h in topHeadlines if h != related][0]
            main_headlines_by_path[doc.path] = main_headline
            if related is not None:
                related_by_main_headline_id[main_headline.id] = related
            if doc.id is not None:
                doc_to_headline_remapping['id:' + doc.id] = 'id:' + main_headline.id
                main_headline_to_docid[main_headline.id] = doc.id
                files_generated += 1
        elif doc.id is not None:
            logging.error("Cannot render document from id: {}. {} headlines {} related".format(
                relpath,
                len(topHeadlines),
                'with' if related is not None else 'without'
            ))

    # Build graph
    graph = {}
    backlink_graph = {}
    for headline in all_headlines:
        links = []
        headline_links = list(headline.get_links())
        # Links under a document's "related" headline count as links of
        # that document's main headline.
        related = related_by_main_headline_id.get(headline.id)
        if related is not None:
            headline_links.extend(list(related.get_links()))

        for org_link in headline_links:
            value = org_link.value
            if value.startswith('http://') or value.startswith('https://'):
                pass # Ignore for now, external URL
            elif value.startswith('id:'):
                links.append({'target': value})
            elif value.startswith('attachment:'):
                pass # Ignore, attachment
            elif value.startswith('file:'):
                pass # Ignore, attachment
            elif value.startswith('notmuch:'):
                pass # Ignore, mail
            elif value.startswith('orgit-rev:'):
                pass # Ignore, mail
            elif value.startswith('*'):
                pass # Ignore, internal
            elif not value.strip() or ':' not in value.split()[0]:
                pass # Ignore, internal (a blank target has no first word)
            elif value.startswith('./'):
                pass # TODO: Properly handle
            else:
                logging.warning('On document {}, unknown link to {}'.format(headline.doc.path, value))

        if headline.parent:
            if isinstance(headline.parent, org_rw.Headline):
                links.append({
                    "target": headline.parent.id,
                    "relation": "in"
                })
        for backlink in links:
            # Parent/child containment is not reported as a backlink.
            if 'relation' in backlink and backlink['relation'] == 'in':
                continue

            target = backlink['target']
            if target.startswith('id:'):
                target = target[len('id:'):]

            if target not in backlink_graph:
                backlink_graph[target] = set()

            backlink_graph[target].add(headline.id)

        graph[headline.id] = {
            "title": org_rw.org_rw.token_list_to_plaintext(headline.title.contents).strip(),
            "links": links,
            "depth": headline.depth,
        }
        # Expose the same node under the id of the document it represents.
        if headline.id in main_headline_to_docid:
            graph[main_headline_to_docid[headline.id]] = graph[headline.id]

        topLevelHeadline = headline
        while isinstance(topLevelHeadline.parent, org_rw.Headline):
            topLevelHeadline = topLevelHeadline.parent

        # Save for full-text-search
        if cleaned_db:
            cur.execute('''INSERT INTO note_search(note_id, title, body, top_level_title, is_done, is_todo) VALUES (?, ?, ?, ?, ?, ?);''',
                        (
                            headline.id,
                            headline.title.get_text(),
                            '\n'.join(headline.doc.dump_headline(headline, recursive=False)),
                            topLevelHeadline.title.get_text(),
                            headline.is_done,
                            headline.is_todo,
                        ))

    # Update graph, replace document ids with headline ids
    for headline_data in graph.values():
        for link in headline_data['links']:
            if link['target'] in doc_to_headline_remapping:
                link['target'] = doc_to_headline_remapping[link['target']]

    # Remap document ids backlinks to main headlines
    for doc_id, main_headline_id in doc_to_headline_remapping.items():
        if doc_id.startswith('id:'):
            doc_id = doc_id[len('id:'):]
        if main_headline_id.startswith('id:'):
            main_headline_id = main_headline_id[len('id:'):]
        for backlink in backlink_graph.get(doc_id, []):
            if main_headline_id not in backlink_graph:
                backlink_graph[main_headline_id] = set()
            backlink_graph[main_headline_id].add(backlink)

    # Render docs after we've built the graph
    # Render main headlines
    # The generated pages declare charset utf-8, so they are written as
    # UTF-8 regardless of the locale's default encoding.
    full_graph_info = { "nodes": graph, "backlinks": backlink_graph, "main_headlines": main_headlines_by_path }
    for _docpath, main_headline in main_headlines_by_path.items():
        if main_headline.doc.id:
            endpath = os.path.join(dest_top, main_headline.doc.id + ".node.html")
            with open(endpath, "wt", encoding="utf-8") as f:
                f.write(render_as_document(main_headline, main_headline.doc, headlineLevel=0, graph=full_graph_info,
                                           title=org_rw.token_list_to_plaintext(main_headline.title.contents)))


    # Render all headlines
    for headline in all_headlines:
        endpath = os.path.join(dest_top, headline.id + ".node.html")

        # Render HTML
        with open(endpath, "wt", encoding="utf-8") as f:
            f.write(render_as_document(headline, headline.doc, headlineLevel=0, graph=full_graph_info,
                                       title=org_rw.token_list_to_plaintext(headline.title.contents)))
            files_generated += 1

        if headline.id == INDEX_ID:
            index_endpath = os.path.join(dest_top, "index.html")
            with open(index_endpath, "wt", encoding="utf-8") as f:
                f.write(render_as_document(headline, headline.doc, headlineLevel=0, graph=full_graph_info,
                                           title=org_rw.token_list_to_plaintext(headline.title.contents)))
                files_generated += 1

    # Output graph files
    graphpath = os.path.join(dest_top, "graph.json")
    with open(graphpath, "wt") as f:
        json.dump(obj=graph, fp=f, indent=2)
    graph_explorer_path = os.path.join(dest_top, "graph.html")
    with open(graph_explorer_path, 'wt') as f:
        with open(os.path.join(os.path.dirname(os.path.abspath(dest_top)), '..', 'static', 'graph_explorer.html'), 'rt') as template:
            source = template.read()
        f.write(source.replace('<!-- REPLACE_THIS_WITH_GRAPH -->',
                               json.dumps(graph)))
    logging.info("Generated {} files".format(files_generated))
    cur.close()
    db.commit()

def main(src_top, dest_top):
    """Build the site once and, when ``WATCH`` is set, rebuild on every change.

    Args:
        src_top: directory holding the org notes.
        dest_top: output directory; created when missing.

    Returns:
        0 after the initial build when watching is disabled. When watching,
        the function keeps consuming inotify events for *src_top* and
        ``STATIC_PATH`` and regenerates everything on each monitored event.
    """
    notifier = inotify.adapters.InotifyTrees([src_top, STATIC_PATH])

    ## Initial load
    t0 = time.time()

    os.makedirs(dest_top, exist_ok=True)
    db = create_db(os.path.join(dest_top, 'db.sqlite3'))
    docs = regen_all(src_top, dest_top, db=db)

    if not WATCH:
        logging.info("Build completed in {:.2f}s".format(time.time() - t0))
        return 0

    logging.info("Initial load completed in {:.2f}s".format(time.time() - t0))
    ## Updating
    for event in notifier.event_gen(yield_nones=False):
        (_ev, types, directory, file) = event
        if not any(event_type in MONITORED_EVENT_TYPES for event_type in types):
            continue
        if is_git_path(directory):
            continue
        filepath = os.path.join(directory, file)
        print("CHANGED: {}".format(filepath))
        t0 = time.time()
        try:
            docs = regen_all(src_top, dest_top, docs=docs, db=db)
        except (Exception, SystemExit):
            # SystemExit is caught on purpose: loading calls sys.exit() when
            # a file cannot be parsed, and that must not stop the watcher.
            # KeyboardInterrupt is left alone so Ctrl-C still ends the loop.
            logging.error(traceback.format_exc())
            logging.error("Loading new templates failed 😿")
            continue
        logging.info("Updated all in {:.2f}s".format(time.time() - t0))

def get_headline_with_name(target_name, doc):
    """Find a headline of *doc* by title.

    Both *target_name* and each headline title are stripped of surrounding
    whitespace before comparing. Returns the first match, or None.
    """
    wanted = target_name.strip()
    matches = (
        candidate
        for candidate in doc.getAllHeadlines()
        if candidate.title.get_text().strip() == wanted
    )
    return next(matches, None)

def assert_id_exists(id, src_headline, graph):
    """Raise NonExistingLocalNoteError unless *id* is a key of ``graph["nodes"]``."""
    known_nodes = graph["nodes"]
    if id in known_nodes:
        return
    raise NonExistingLocalNoteError(id, src_headline)

def print_tree(tree, indentation=0, headline=None):
    """Debug helper that dumps a DOM tree; currently switched off.

    The unconditional ``return`` right below makes this function a no-op.
    Removing it enables the dump implemented by the loop that follows.
    """
    # if headline and headline.id != INDEX_ID:
    #     return
    return
    for node in tree:
        attribute_names = dir(node)
        if "children" in attribute_names:
            if len(node.children) > 0:
                print_element(node.children, indentation + 1, headline)
                print()

        elif "content" in attribute_names:
            for piece in node.content:
                print_element(piece, indentation + 1, headline)

def print_element(element, indentation, headline):
    """Print one DOM element, indented by *indentation* spaces.

    Links are printed in raw form, strings wrapped in braces together with
    their type, and anything else is handed to ``print_tree``.
    """
    padding = " " * indentation
    if isinstance(element, org_rw.Link):
        print(padding, "Link:", element.get_raw())
        return
    if isinstance(element, str):
        print(padding, "{" + element + "}", type(element))
        return
    print_tree(element, indentation, headline)


def render_property_drawer(element, acc, headline, graph):
    """Renderer for property drawers: emits nothing."""
    return None


def render_logbook_drawer(element, acc, headline, graph):
    """Renderer for logbook drawers: emits nothing."""
    return None


def render_property_node(element, acc, headline, graph):
    """Renderer for property nodes: emits nothing."""
    return None


def render_list_group(element, acc, headline, graph):
    """Append a ``<ul>`` element containing the rendered children of *element*."""
    opening, closing = "<ul>", "</ul>"
    acc.append(opening)
    render_tree(element.children, acc, headline, graph)
    acc.append(closing)

def render_table(element, acc, headline, graph):
    """Append a ``<table>`` element containing the rendered rows of *element*.

    The parameters follow the ``(element, acc, headline, graph)`` order that
    ``render_tag`` uses for every renderer, so the headline and the graph
    reach the children in the right positions.
    """
    acc.append("<table>")
    render_tree(element.children, acc, headline, graph)
    acc.append("</table>")

def render_table_row(element, acc, headline, graph):
    """Append a ``<tr>`` with one ``<td>`` per cell, the cell text HTML-escaped."""
    acc.append("<tr>")
    for cell_text in element.cells:
        acc.extend(("<td>", html.escape(cell_text), "</td>"))
    acc.append("</tr>")

def render_table_separator_row(element, acc, headline, graph):
    """Append an empty table row marked with the ``__table-separator`` class."""
    separator_markup = "<tr class='__table-separator'></tr>"
    acc.append(separator_markup)

def render_list_item(element, acc, headline, graph):
    """Append an ``<li>`` holding the optional tag and the content of a list item.

    The tag, when present, goes inside ``<span class='tag'>``; the content
    always goes inside ``<span class='item'>``.
    """
    acc.append("<li>")

    item_tag = element.tag
    if item_tag is not None:
        acc.append("<span class='tag'>")
        render_text_tokens(item_tag, acc, headline, graph)
        acc.append("</span>")

    acc.append("<span class='item'>")
    render_text_tokens(element.content, acc, headline, graph)
    acc.append("</span></li>")

def render_block(content, acc, _class, is_code):
    """Append *content* as a ``<pre>`` block, dedented.

    Args:
        content: text to emit; inserted as given, so it must already be
            HTML-escaped by the caller.
        acc: list of output fragments to append to.
        _class: value of the ``class`` attribute of the ``<pre>`` element.
        is_code: when true, the text is additionally wrapped in ``<code>``.

    The leading-space indentation shared by all non-blank lines is removed
    from every line. Content without any non-blank line is emitted as is.
    """
    acc.append('<pre class="{}">'.format(_class))
    if is_code:
        acc.append('<code>')

    lines = content.split('\n')

    # Remove indentation common to all lines. ``default=0`` covers content
    # with no non-blank lines, where min() would otherwise raise ValueError.
    base_indentation = min(
        (len(line) - len(line.lstrip(' ')) for line in lines if line.strip()),
        default=0,
    )

    acc.append('\n'.join(line[base_indentation:] for line in lines))
    if is_code:
        acc.append('</code>')
    acc.append('</pre>')

def render_code_block(element, acc, headline, graph):
    """Append the HTML for a source block, syntax-highlighted when possible.

    The first word of ``element.arguments`` is used as the pygments lexer
    name. When there is no such word, or pygments knows no lexer with that
    name, an error is logged and the escaped code is emitted through
    ``render_block`` with the classes ``code`` and the lower-cased subtype.
    """
    code = element.lines

    # split() yields no words for None-like, empty or whitespace-only
    # arguments, which avoids indexing into an empty list.
    argument_words = element.arguments.split() if element.arguments else []
    if argument_words:
        try:
            lexer = pygments.lexers.get_lexer_by_name(argument_words[0], stripall=True)
        except pygments.util.ClassNotFound:
            pass
        else:
            content = pygments.highlight(code,
                                         lexer,
                                         pygments.formatters.HtmlFormatter()
                                         )
            acc.append(content)
            return

    logging.error("Cannot find lexer for {}".format(element.subtype.lower()))
    content = html.escape(code)
    render_block(content, acc, _class='code ' + element.subtype.lower(), is_code=True)


def render_results_block(element, acc, headline, graph):
    """Append the raw text of a results drawer as a ``<pre>`` block.

    The raw form of every child is joined with newlines. Nothing is emitted
    when the joined text is empty or whitespace only.
    """
    content = '\n'.join(child.get_raw() for child in element.children)
    if not content.strip():
        return
    render_block(content, acc, _class='results lang-text', is_code=False)

def render_org_text(element, acc, headline, graph):
    """Convert an ``org_rw.Text`` to DOM tokens and render them as text."""
    dom_tokens = org_rw.text_to_dom(element.contents, element)
    render_text_tokens(dom_tokens, acc, headline, graph)

def render_text(element, acc, headline, graph):
    """Append the rendered content of a text node inside ``<div class="text">``."""
    opening, closing = '<div class="text">', '</div>'
    acc.append(opening)
    render_text_tokens(element.content, acc, headline, graph)
    acc.append(closing)

def render_text_tokens(tokens, acc, headline, graph):
    """Append the HTML for a sequence of text tokens, wrapped in ``<p>``.

    Args:
        tokens: an iterable of chunks, or an ``org_rw.Text`` whose
            ``contents`` are used. String chunks and ``Link`` chunks are
            rendered; any other chunk type is skipped.
        acc: list of output fragments to append to.
        headline: headline being rendered; used to resolve relative and
            in-document links and for log messages.
        graph: dict with the ``nodes`` and ``main_headlines`` lookups.

    String chunks are HTML-escaped, split into paragraphs on blank lines and
    bare http(s)/ftp(s) URLs become external links. ``Link`` chunks become
    internal links when they point to a known note, external links for
    http(s) and absolute paths.

    Raises:
        NotImplementedError: for a link type that is not recognised.

    Links to note ids missing from the graph are logged and rendered as
    plain text.
    """
    acc.append('<p>')
    if isinstance(tokens, org_rw.Text):
        tokens = tokens.contents
    for chunk in tokens:
        if isinstance(chunk, str):
            lines = chunk.split('\n\n')
            contents = []
            for line in lines:
                line_chunks = []
                for word in TEXT_OR_LINK_RE.findall(line):
                    if '://' in word and not (word.startswith('org-protocol://')):
                        if not word.startswith(('http://', 'https://', 'ftp://', 'ftps://')):
                            logging.warning('Is this a link? {} (on {})\nLine: {}\nChunks: {}'.format(word, headline.doc.path, line, line_chunks))
                            line_chunks.append(html.escape(word))
                        else:
                            # The URL comes straight from the note text, so it
                            # is escaped for the attribute as well as for the
                            # visible description.
                            escaped_url = html.escape(word)
                            line_chunks.append('<a href="{url}" class="external">{description}</a>'
                                               .format(url=escaped_url,
                                                       description=escaped_url))
                    else:
                        line_chunks.append(html.escape(word))
                contents.append(''.join(line_chunks))

            acc.append('<span class="line">{}</span>'.format('</p><p>'.join(contents)))

        elif isinstance(chunk, Link):
            link_target = chunk.value
            is_internal_link = True
            description = chunk.description
            if description is None:
                description = chunk.value

            try:
                if link_target.startswith('id:'):
                    assert_id_exists(link_target[3:], headline, graph)
                    link_target = './' + link_target[3:] + '.node.html'
                elif link_target.startswith('./') or link_target.startswith('../'):
                    if '::' in link_target:
                        logging.warning('Not implemented headline links to other files. Used on {}'.format(link_target))

                    else:
                        target_path = os.path.abspath(os.path.join(os.path.dirname(headline.doc.path), link_target))
                        if target_path not in graph['main_headlines']:
                            logging.warning('Link to doc not in graph: {}'.format(target_path))
                        else:
                            assert_id_exists(graph['main_headlines'][target_path].id, headline, graph)
                            link_target = './' + graph['main_headlines'][target_path].id + '.node.html'
                elif link_target.startswith('attachment:'):
                    logging.warning('Not implemented `attachment:` links. Used on {}'.format(link_target))
                elif link_target.startswith('* '):
                    # Drop only the "* " marker. str.lstrip('* ') would also
                    # eat asterisks and spaces that belong to the title.
                    target_headline = get_headline_with_name(link_target[len('* '):], headline.doc)
                    if target_headline is None:
                        logging.warning('No headline found corresponding to {}. On file {}'.format(link_target, headline.doc.path))
                    else:
                        assert_id_exists(target_headline.id, headline, graph)
                        link_target = './' + target_headline.id + '.node.html'
                else:
                    is_internal_link = False
                    if not link_target.startswith(('https://', 'http://', '/')):
                        raise NotImplementedError('Unknown link type: {}'
                                                  .format(link_target))

                acc.append('<a href="{}" class="{}" >{}</a>'.format(
                    html.escape(link_target),
                    'internal' if is_internal_link else 'external',
                    html.escape(description),
                ))
            except NonExistingLocalNoteError as err:
                logging.warning(err.get_message())
                acc.append(html.escape(description))
        # else:
        #     raise NotImplementedError('TextToken: {}'.format(chunk))
    acc.append('</p>')


def render_tag(element, acc, headline, graph):
    """Render *element* with the renderer registered for its exact type.

    Returns whatever the selected renderer returns. An element whose type
    has no entry in the table raises KeyError.
    """
    renderers = {
        dom.PropertyDrawerNode: render_property_drawer,
        dom.LogbookDrawerNode: render_logbook_drawer,
        dom.PropertyNode: render_property_node,
        dom.ListGroupNode: render_list_group,
        dom.ListItem: render_list_item,
        dom.TableNode: render_table,
        dom.TableSeparatorRow: render_table_separator_row,
        dom.TableRow: render_table_row,
        dom.CodeBlock: render_code_block,
        dom.Text: render_text,
        dom.ResultsDrawerNode: render_results_block,
        org_rw.Text: render_org_text,
    }
    renderer = renderers[type(element)]
    return renderer(element, acc, headline, graph)


def render_tree(tree, acc, headline, graph):
    """Render every node of *tree*, in order, into *acc*."""
    for node in tree:
        render_tag(node, acc, headline, graph)

def render_inline(tree, f, headline, graph):
    """Run renderer *f* on *tree* and return its output as a single string."""
    fragments = []
    f(tree, fragments, headline, graph)
    return ''.join(fragments)


def render_as_document(headline, doc, headlineLevel, graph, title):
    """Return the complete HTML page for *headline*.

    A headline nested under another headline gets a small page that
    redirects to its top-level ancestor, anchored at the headline's own id.
    A top-level headline gets its full rendering wrapped by ``as_document``.

    Args:
        headline: headline to produce the page for.
        doc: document the headline belongs to.
        headlineLevel: nesting level passed on to ``render``.
        graph: graph information passed on to ``render``.
        title: plain-text page title.
    """
    if isinstance(headline.parent, org_rw.Headline):
        topLevelHeadline = headline.parent
        while isinstance(topLevelHeadline.parent, org_rw.Headline):
            topLevelHeadline = topLevelHeadline.parent

        # Titles and ids are plain text taken from the notes, so they are
        # escaped before being placed into the markup of this page.
        safe_title = html.escape(title)
        redirect_target = html.escape(f"./{topLevelHeadline.id}.node.html#{headline.id}")
        main_note_title = html.escape(org_rw.token_list_to_plaintext(topLevelHeadline.title.contents))
        return f"""<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <title>{safe_title} @ {SITE_NAME}</title>
    <meta http-equiv="refresh" content="0;{redirect_target}" />
    <link href="../static/style.css" rel="stylesheet"/>
    <link href="../static/light-syntax.css" rel="stylesheet"/>
  </head>
  <body>
    <nav>
      <h1><a href="./index.html">Código para llevar [Notes]</a></h1>
    </nav>
    <a href='{redirect_target}'>Sending you to the main note... [{main_note_title}]</a>
  </body>
</html>
        """
    else:
        return as_document(render(headline, doc, graph=graph, headlineLevel=headlineLevel), title)

def render_connections(headline_id, content, graph):
    """Append the "These notes link here" list for *headline_id* to *content*.

    Nothing is appended when the headline has no entry in
    ``graph['backlinks']``. Entries are ordered by the title of the linking
    note, and titles are HTML-escaped.
    """
    backlinks = graph['backlinks']
    if headline_id not in backlinks:
        return

    content.append("<div class='connections'><span class='backlink-explanation'>These notes link here:</span><ul>")

    def title_of(source_id):
        return graph['nodes'][source_id]['title']

    for source_id in sorted(backlinks[headline_id], key=title_of):
        source_title = graph["nodes"][source_id]["title"]
        content.append(f"<li><a class='internal backlink' href='./{source_id}.node.html'>{html.escape(source_title)}</a></li>")

    content.append("</ul></div>")

def render(headline, doc, graph, headlineLevel):
    """Return the HTML fragment for *headline* and, recursively, its children.

    Args:
        headline: headline to render.
        doc: document the headline belongs to; its ``STARTUP`` keyword
            decides which levels start collapsed.
        graph: graph information used for links and backlinks.
        headlineLevel: 0-based nesting level of *headline* in this rendering.

    Returns:
        A ``<div class="node ...">`` string with the state, title, tags and
        contents of the headline.

    Raises:
        Whatever ``headline.as_dom()`` raises, after logging the document path.
    """
    try:
        # Named ``dom_tree`` so the imported ``dom`` module is not shadowed.
        dom_tree = headline.as_dom()
    except Exception:
        logging.error("Error generating DOM for {}".format(doc.path))
        raise
    print_tree(dom_tree, indentation=2, headline=headline)

    content = []
    render_tree(dom_tree, content, headline, graph)
    if headline.id:
        render_connections(headline.id, content, graph)

    for child in headline.children:
        content.append(render(child, doc, headlineLevel=headlineLevel+1, graph=graph))

    if headline.state is None:
        state = ""
    else:
        state = f'<span class="state todo-{headline.is_todo} state-{headline.state}">{headline.state}</span>'

    if headline.is_todo:
        todo_state = "todo"
    else:
        todo_state = "done"

    tag_list = []
    for tag in headline.shallow_tags:
        tag_list.append(f'<span class="tag">{html.escape(tag)}</span>')
    tags = f'<span class="tags">{"".join(tag_list)}</span>'

    display_state = 'expanded'
    # Update display based on document STARTUP config
    visual_level = doc.get_keywords('STARTUP', 'showall')
    if visual_level.startswith('show') and visual_level.endswith('levels'):
        level_text = visual_level[len('show'):-len('levels')]
        try:
            visual_level_num = int(level_text) - 1
        except ValueError:
            # e.g. "showlevels": keep everything expanded instead of
            # aborting the whole build on a malformed keyword.
            logging.warning("Ignoring malformed STARTUP value {!r} on {}".format(visual_level, doc.path))
        else:
            # Note that level is 0 indexed inside this loop
            if headlineLevel >= visual_level_num:
                display_state = 'collapsed'

    title = render_inline(headline.title, render_tag, headline, graph)

    if headlineLevel > 0:
        title = f"<a href=\"javascript:toggle_expand('{html.escape(headline.id)}')\">{title}</a>"

    return f"""
<div id="{html.escape(headline.id)}" class="node {todo_state} {display_state}">
  <h1 class="title">
    {state}
    {title}
    {tags}
  </h1>
  <div class='contents'>
    {''.join(content)}
  </div>
</div>
"""


def as_document(html, title):
    """Wrap an HTML fragment into the complete page used for notes.

    Args:
        html: already-rendered markup, inserted into the body unchanged.
        title: plain-text page title; it is HTML-escaped here.

    Returns:
        The full document as a string, including the stylesheet links, the
        ``toggle_expand`` helper and the search box scripts.
    """
    # The ``html`` parameter hides the ``html`` module inside this function,
    # so the escaping helper is imported locally under its own name.
    from html import escape

    safe_title = escape(title)
    return f"""<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <title>{safe_title} @ {SITE_NAME}</title>
    <link href="../static/style.css" rel="stylesheet"/>
    <link href="../static/light-syntax.css" rel="stylesheet"/>
    <script type="text/javascript">
      function toggle_expand(header_id) {{
        var e = document.getElementById(header_id);
        if (e.classList.contains('expanded')) {{
          e.classList.add('collapsed');
          e.classList.remove('expanded');
        }}
        else {{
          e.classList.add('expanded');
          e.classList.remove('collapsed');
        }}
      }}
    </script>
  </head>
  <body>
    <nav>
      <h1><a href="./index.html">Código para llevar [Notes]</a></h1>
      <input type="text" id="searchbox" disabled="true" placeholder="Search (requires JS)" />
    </nav>
    {html}

    <script src="../static/search-box.js"></script>
    <script type="text/javascript">_codigoparallevar_enable_search_box('#searchbox', {{placeholder: 'Search...'}})</script>
  </body>
</html>
    """


def save_changes(doc):
    """Write *doc* back to the file at ``doc.path``, which must not be None."""
    destination = doc.path
    assert destination is not None
    with open(destination, "wt") as out:
        dump_org(doc, out)


# Command-line entry point: generate.py SOURCE_TOP DEST_TOP
if __name__ == "__main__":
    if len(sys.argv) != 3:
        # A wrong invocation is an error: report it on stderr and exit with
        # a non-zero status so calling scripts can detect it.
        print("Usage: {} SOURCE_TOP DEST_TOP".format(sys.argv[0]), file=sys.stderr)
        sys.exit(1)

    logging.basicConfig(level=logging.INFO, format="%(levelname)-8s %(message)s")
    # sys.exit is used instead of the interactive ``exit`` helper, which is
    # only present when the ``site`` module has been loaded.
    sys.exit(main(sys.argv[1], sys.argv[2]))