From f1781bb1aeff338430741ae68791af0363b0513e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergio=20Mart=C3=ADnez=20Portela?=
Date: Sat, 16 Sep 2023 23:22:19 +0200
Subject: [PATCH] Implement proper article summarization for index.

---
 scripts/blog.py | 87 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 1 deletion(-)

diff --git a/scripts/blog.py b/scripts/blog.py
index 7ddc083..6ca49af 100644
--- a/scripts/blog.py
+++ b/scripts/blog.py
@@ -12,6 +12,7 @@ MARKDOWN_EXTRA_FEATURES = [
     'markdown.extensions.extra',
 ]
 
+import copy
 import json
 import logging
 import sys
@@ -24,12 +25,15 @@ import re
 from typing import List
 
 from bs4 import BeautifulSoup as bs4
+import bs4 as BeautifulSoup
 import jinja2
 import inotify.adapters
 import yaml
 import markdown
 from unidecode import unidecode
 
+SUMMARIZE_MAX_TOKENS = 1000
+
 NIKOLA_DATE_RE = re.compile(r'^([0-2]\d|30|31)\.(0\d|1[012])\.(\d{4}), (\d{1,2}):(\d{2})$')
 
 COMPLETE_DATE_RE = re.compile(r'^(\d{4})-(0\d|1[012])-([0-2]\d|30|31) '
@@ -192,7 +196,88 @@ def render_article(doc, front_matter, f):
     f.write(result)
 
 def summarize(doc):
-    return bs4(doc, features='lxml').text[:1000]
+    tree = bs4(doc, features='lxml')
+
+    html = list(tree.children)[0]
+    body = list(html.children)[0]
+
+    comments = tree.find_all(string=lambda text: isinstance(text, BeautifulSoup.Comment))
+
+    teaser_end = None
+    for comment in comments:
+        if 'TEASER_END' in comment:
+            teaser_end = comment
+            break
+
+    if 'gnucash' in doc:
+        assert teaser_end is not None
+
+    def recur_select_to_summarize(source, dest, num_tokens):
+        for item in source.children:
+            if num_tokens + len(item.text) < SUMMARIZE_MAX_TOKENS:
+                # All source fits
+                num_tokens += len(item.text)
+                dest.append(item)
+
+            else:
+                if not isinstance(item, BeautifulSoup.NavigableString):
+                    # Let's take as much source as we can and then stop
+                    subsect = bs4()
+                    recur_select_to_summarize(item, subsect, num_tokens)
+
+                    if len(list(subsect.children)) > 0:
+                        dest.append(subsect)
+                break
+
+    def cut_after_element(reference):
+        while reference.next_sibling is None:
+            if reference.parent is None:
+                logging.warning("Reached root when looking for cutting point for teaser. Doc: {}".format(doc[:100]))
+                return
+            reference = reference.parent
+
+        nxt = reference.next_sibling
+        while nxt is not None:
+            was = nxt
+            if reference.next_sibling is not None:
+                # Move to the "right"
+                nxt = reference.next_sibling
+
+            else:
+                # Move "up and right"
+                nxt = reference.parent
+                if nxt is not None:
+                    nxt = nxt.next_sibling
+            was.extract()
+
+    if teaser_end is None:
+        result = bs4()
+
+        recur_select_to_summarize(body, result, 0)
+    else:
+        summary = copy.copy(body)
+        comments = summary.find_all(string=lambda text: isinstance(text, BeautifulSoup.Comment))
+
+        teaser_end = None
+        for comment in comments:
+            if 'TEASER_END' in comment:
+                teaser_end = comment
+                break
+        assert teaser_end is not None, 'Error finding teaser end on copy'
+
+        cut_after_element(teaser_end)
+        result = summary
+
+    # Update summary links and hrefs
+    for v in result.find_all('video') + result.find_all('image'):
+        if 'src' in v.attrs and ':' not in v['src']:
+            v['src'] = '/blog/' + v['src'].lstrip('/')
+
+    for v in result.find_all('a'):
+        if 'href' in v.attrs and ':' not in v['href']:
+            v['href'] = '/blog/' + v['href'].lstrip('/')
+
+    return result
 
 def render_index(docs, dest_top):
     docs = sorted(docs.values(), key=lambda x: x[1]['date'], reverse=True)
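
Editorial note, not part of the patch: a minimal sketch of how the new summarize() could be exercised, assuming scripts/blog.py is importable as a module and its dependencies (bs4 with lxml, markdown, jinja2, inotify, yaml, unidecode) are installed. The sample article text, the module path, and the variable names below are illustrative assumptions, not code from the repository.

    # Hypothetical usage sketch -- names and inputs here are assumptions.
    import markdown
    from scripts.blog import summarize  # assumes scripts/ is importable

    article_md = (
        "Intro paragraph that should appear on the index.\n\n"
        "<!-- TEASER_END -->\n\n"
        "Long body text that the index should not show."
    )
    # Render the article the same way the blog renders markdown sources.
    article_html = markdown.markdown(article_md, extensions=['markdown.extensions.extra'])

    # summarize() now returns a BeautifulSoup tree cut at the TEASER_END
    # comment (or at roughly SUMMARIZE_MAX_TOKENS characters if no teaser
    # marker is present), rather than a plain 1000-character text slice.
    summary = summarize(article_html)
    print(summary)  # serialized HTML snippet suitable for the index page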