Implement proper article summarization for index.

This commit is contained in:
Sergio Martínez Portela 2023-09-16 23:22:19 +02:00
parent da6536ff53
commit f1781bb1ae

View File

@ -12,6 +12,7 @@ MARKDOWN_EXTRA_FEATURES = [
'markdown.extensions.extra', 'markdown.extensions.extra',
] ]
import copy
import json import json
import logging import logging
import sys import sys
@ -24,12 +25,15 @@ import re
from typing import List from typing import List
from bs4 import BeautifulSoup as bs4 from bs4 import BeautifulSoup as bs4
import bs4 as BeautifulSoup
import jinja2 import jinja2
import inotify.adapters import inotify.adapters
import yaml import yaml
import markdown import markdown
from unidecode import unidecode from unidecode import unidecode
SUMMARIZE_MAX_TOKENS = 1000
NIKOLA_DATE_RE = re.compile(r'^([0-2]\d|30|31)\.(0\d|1[012])\.(\d{4}), (\d{1,2}):(\d{2})$') NIKOLA_DATE_RE = re.compile(r'^([0-2]\d|30|31)\.(0\d|1[012])\.(\d{4}), (\d{1,2}):(\d{2})$')
COMPLETE_DATE_RE = re.compile(r'^(\d{4})-(0\d|1[012])-([0-2]\d|30|31) ' COMPLETE_DATE_RE = re.compile(r'^(\d{4})-(0\d|1[012])-([0-2]\d|30|31) '
@ -192,7 +196,88 @@ def render_article(doc, front_matter, f):
f.write(result) f.write(result)
def summarize(doc):
    """Build an HTML summary of a rendered article for the index page.

    If the document body contains an HTML comment that includes the text
    ``TEASER_END``, the summary is a copy of the body with everything that
    follows that comment (in document order) removed.  Otherwise leading
    nodes of the body are taken until about ``SUMMARIZE_MAX_TOKENS``
    characters of text have been collected.

    Afterwards, ``src`` attributes of video/image tags and ``href``
    attributes of links that contain no ``:`` are rewritten to live under
    ``/blog/``.

    Args:
        doc: HTML source of the article, as a string.

    Returns:
        A BeautifulSoup node holding the summary: a copy of the ``<body>``
        tag when a teaser marker was found, or a new BeautifulSoup document
        otherwise.
    """
    tree = bs4(doc, features='lxml')
    # Look the body up by name instead of by position: the positional lookup
    # fails on an empty document and picks <head> when the parser adds one.
    body = tree.body if tree.body is not None else tree

    def find_teaser_end(root):
        # Return the first comment under `root` containing 'TEASER_END', or None.
        comments = root.find_all(
            string=lambda text: isinstance(text, BeautifulSoup.Comment))
        for comment in comments:
            if 'TEASER_END' in comment:
                return comment
        return None

    def recur_select_to_summarize(source, dest, num_tokens):
        # Move nodes from `source` into `dest` while the text budget allows.
        # `num_tokens` counts characters of text already selected.
        #
        # Iterate over a snapshot: dest.append() re-parents the node, which
        # removes it from `source` and would make a live iterator skip the
        # following sibling.
        for item in list(source.children):
            item_len = len(item.text)
            if num_tokens + item_len < SUMMARIZE_MAX_TOKENS:
                # All of this node fits
                num_tokens += item_len
                dest.append(item)
                continue

            if not isinstance(item, BeautifulSoup.NavigableString):
                # Let's take as much of this node as we can and then stop.
                # NOTE(review): the partial content is collected in a bare
                # document, so the wrapping tag of `item` is not kept.
                subsect = bs4()
                recur_select_to_summarize(item, subsect, num_tokens)
                if len(subsect.contents) > 0:
                    dest.append(subsect)
            # The budget is exhausted: nothing after this node is included.
            break

    def cut_after_element(reference):
        # Remove everything that follows `reference` in document order:
        # its following siblings, then those of each of its ancestors.
        removed_any = False
        node = reference
        while node is not None:
            for sibling in list(node.next_siblings):
                sibling.extract()
                removed_any = True
            node = node.parent
        if not removed_any:
            # The marker was already the last thing in the document.
            logging.warning("Reached root when looking for cutting point for teaser. Doc: {}".format(doc[:100]))

    if find_teaser_end(body) is None:
        result = bs4()
        recur_select_to_summarize(body, result, 0)
    else:
        # Work on a detached copy so climbing through parents stops at the
        # copied body; the marker has to be located again inside the copy.
        summary = copy.copy(body)
        teaser_end = find_teaser_end(summary)
        assert teaser_end is not None, 'Error finding teaser end on copy'
        cut_after_element(teaser_end)
        result = summary

    # Update summary links and hrefs
    # ('img' is the HTML image element; 'image' is kept for compatibility.)
    for v in result.find_all(['video', 'img', 'image']):
        if 'src' in v.attrs and ':' not in v['src']:
            v['src'] = '/blog/' + v['src'].lstrip('/')

    for v in result.find_all('a'):
        if 'href' in v.attrs and ':' not in v['href']:
            v['href'] = '/blog/' + v['href'].lstrip('/')

    return result
def render_index(docs, dest_top): def render_index(docs, dest_top):
docs = sorted(docs.values(), key=lambda x: x[1]['date'], reverse=True) docs = sorted(docs.values(), key=lambda x: x[1]['date'], reverse=True)