From f1781bb1aeff338430741ae68791af0363b0513e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergio=20Mart=C3=ADnez=20Portela?=
Date: Sat, 16 Sep 2023 23:22:19 +0200
Subject: [PATCH] Implement proper article summarization for index.

---
 scripts/blog.py | 87 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 1 deletion(-)

diff --git a/scripts/blog.py b/scripts/blog.py
index 7ddc083..6ca49af 100644
--- a/scripts/blog.py
+++ b/scripts/blog.py
@@ -12,6 +12,7 @@ MARKDOWN_EXTRA_FEATURES = [
     'markdown.extensions.extra',
 ]
 
+import copy
 import json
 import logging
 import sys
@@ -24,12 +25,15 @@ import re
 from typing import List
 
 from bs4 import BeautifulSoup as bs4
+import bs4 as BeautifulSoup
 import jinja2
 import inotify.adapters
 import yaml
 import markdown
 from unidecode import unidecode
 
+SUMMARIZE_MAX_TOKENS = 1000
+
 NIKOLA_DATE_RE = re.compile(r'^([0-2]\d|30|31)\.(0\d|1[012])\.(\d{4}), (\d{1,2}):(\d{2})$')
 
 COMPLETE_DATE_RE = re.compile(r'^(\d{4})-(0\d|1[012])-([0-2]\d|30|31) '
@@ -192,7 +196,88 @@ def render_article(doc, front_matter, f):
     f.write(result)
 
 def summarize(doc):
-    return bs4(doc, features='lxml').text[:1000]
+    tree = bs4(doc, features='lxml')
+
+    html = list(tree.children)[0]
+    body = list(html.children)[0]
+
+    comments = tree.find_all(string=lambda text: isinstance(text, BeautifulSoup.Comment))
+
+    teaser_end = None
+    for comment in comments:
+        if 'TEASER_END' in comment:
+            teaser_end = comment
+            break
+
+    if 'gnucash' in doc:
+        assert teaser_end is not None
+
+    def recur_select_to_summarize(source, dest, num_tokens):
+        for item in source.children:
+            if num_tokens + len(item.text) < SUMMARIZE_MAX_TOKENS:
+                # All source fits
+                num_tokens += len(item.text)
+                dest.append(item)
+
+            else:
+                if not isinstance(item, BeautifulSoup.NavigableString):
+                    # Let's take as much source as we can and then stop
+                    subsect = bs4()
+                    recur_select_to_summarize(item, subsect, num_tokens)
+
+                    if len(list(subsect.children)) > 0:
+                        dest.append(subsect)
+                break
+
+    def cut_after_element(reference):
+        while reference.next_sibling is None:
+            if reference.parent is None:
+                logging.warning("Reached root when looking for cutting point for teaser. Doc: {}".format(doc[:100]))
+                return
+            reference = reference.parent
+
+        nxt = reference.next_sibling
+        while nxt is not None:
+            was = nxt
+            if reference.next_sibling is not None:
+                # Move to the "right"
+                nxt = reference.next_sibling
+
+            else:
+                # Move "up and right"
+                nxt = reference.parent
+                if nxt is not None:
+                    nxt = nxt.next_sibling
+            was.extract()
+
+    if teaser_end is None:
+        result = bs4()
+
+        recur_select_to_summarize(body, result, 0)
+    else:
+        summary = copy.copy(body)
+        comments = summary.find_all(string=lambda text: isinstance(text, BeautifulSoup.Comment))
+
+        teaser_end = None
+        for comment in comments:
+            if 'TEASER_END' in comment:
+                teaser_end = comment
+                break
+        assert teaser_end is not None, 'Error finding teaser end on copy'
+
+        cut_after_element(teaser_end)
+        result = summary
+
+    # Update summary links and hrefs
+    for v in result.find_all('video') + result.find_all('image'):
+        if 'src' in v.attrs and ':' not in v['src']:
+            v['src'] = '/blog/' + v['src'].lstrip('/')
+
+    for v in result.find_all('a'):
+        if 'href' in v.attrs and ':' not in v['href']:
+            v['href'] = '/blog/' + v['href'].lstrip('/')
+
+    return result
 
 def render_index(docs, dest_top):
     docs = sorted(docs.values(), key=lambda x: x[1]['date'], reverse=True)
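
Editorial note, not part of the patch: a minimal sketch of how the new summarize() could be exercised, assuming scripts/blog.py is importable as a module and its dependencies (bs4 with lxml, markdown, jinja2, inotify, yaml, unidecode) are installed. The sample article text, the module path, and the variable names below are illustrative assumptions, not code from the repository.

    # Hypothetical usage sketch -- names and inputs here are assumptions.
    import markdown
    from scripts.blog import summarize  # assumes scripts/ is importable

    article_md = (
        "Intro paragraph that should appear on the index.\n\n"
        "<!-- TEASER_END -->\n\n"
        "Long body text that the index should not show."
    )
    # Render the article the same way the blog renders markdown sources.
    article_html = markdown.markdown(article_md, extensions=['markdown.extensions.extra'])

    # summarize() now returns a BeautifulSoup tree cut at the TEASER_END
    # comment (or at roughly SUMMARIZE_MAX_TOKENS characters if no teaser
    # marker is present), rather than a plain 1000-character text slice.
    summary = summarize(article_html)
    print(summary)  # serialized HTML snippet suitable for the index page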