Implement proper article summarization for index.
This commit is contained in:
parent
da6536ff53
commit
f1781bb1ae
@ -12,6 +12,7 @@ MARKDOWN_EXTRA_FEATURES = [
|
|||||||
'markdown.extensions.extra',
|
'markdown.extensions.extra',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
import copy
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
@ -24,12 +25,15 @@ import re
|
|||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from bs4 import BeautifulSoup as bs4
|
from bs4 import BeautifulSoup as bs4
|
||||||
|
import bs4 as BeautifulSoup
|
||||||
import jinja2
|
import jinja2
|
||||||
import inotify.adapters
|
import inotify.adapters
|
||||||
import yaml
|
import yaml
|
||||||
import markdown
|
import markdown
|
||||||
from unidecode import unidecode
|
from unidecode import unidecode
|
||||||
|
|
||||||
|
# Size budget for automatically generated summaries. Despite the name, the
# value is compared against character counts (len() of node text) in
# summarize(), not against a count of tokens.
SUMMARIZE_MAX_TOKENS = 1000
|
||||||
|
|
||||||
NIKOLA_DATE_RE = re.compile(r'^([0-2]\d|30|31)\.(0\d|1[012])\.(\d{4}), (\d{1,2}):(\d{2})$')
|
NIKOLA_DATE_RE = re.compile(r'^([0-2]\d|30|31)\.(0\d|1[012])\.(\d{4}), (\d{1,2}):(\d{2})$')
|
||||||
|
|
||||||
COMPLETE_DATE_RE = re.compile(r'^(\d{4})-(0\d|1[012])-([0-2]\d|30|31) '
|
COMPLETE_DATE_RE = re.compile(r'^(\d{4})-(0\d|1[012])-([0-2]\d|30|31) '
|
||||||
@ -192,7 +196,88 @@ def render_article(doc, front_matter, f):
|
|||||||
f.write(result)
|
f.write(result)
|
||||||
|
|
||||||
def summarize(doc):
    """Build the HTML summary of a rendered article for the index page.

    If the article contains an HTML comment with the text ``TEASER_END``,
    the summary is everything up to (and including) that comment.  Otherwise
    the summary is the longest leading part of the article whose text stays
    below ``SUMMARIZE_MAX_TOKENS`` characters, cut at element boundaries.

    Relative ``src``/``href`` attributes in the summary are rewritten to be
    rooted at ``/blog/`` so they keep working when shown on the index page.

    Args:
        doc: The article rendered as an HTML string.

    Returns:
        A BeautifulSoup node holding the summary: a detached copy of the
        ``<body>`` element when a teaser marker is present, otherwise a new
        ``BeautifulSoup`` document containing the selected nodes.
    """
    tree = bs4(doc, features='lxml')

    # Ask for <body> by name instead of assuming it is the first child of the
    # first child: a leading comment/doctype or a <head> breaks that guess.
    # An empty document has no <body>; fall back to the (empty) tree itself.
    body = tree.body if tree.body is not None else tree

    def find_teaser_end(root):
        """Return the first comment under *root* mentioning TEASER_END."""
        comments = root.find_all(
            string=lambda text: isinstance(text, BeautifulSoup.Comment))
        for comment in comments:
            if 'TEASER_END' in comment:
                return comment
        return None

    def recur_select_to_summarize(source, dest, num_tokens):
        """Move leading children of *source* into *dest* within the budget."""
        # Snapshot the children first: append() re-parents each node, which
        # would otherwise mutate the list being iterated and skip siblings.
        for item in list(source.children):
            length = len(item.text)
            if num_tokens + length < SUMMARIZE_MAX_TOKENS:
                # The whole node fits.
                num_tokens += length
                dest.append(item)
                continue

            if not isinstance(item, BeautifulSoup.NavigableString):
                # Take as much of this element as fits, keeping its own tag
                # and attributes so the partial content stays well nested.
                subsect = tree.new_tag(item.name, attrs=dict(item.attrs))
                recur_select_to_summarize(item, subsect, num_tokens)
                if subsect.contents:
                    dest.append(subsect)

            # Budget exhausted: nothing after this node may be included.
            break

    def cut_after_element(reference):
        """Remove everything that follows *reference* in document order."""
        removed = 0
        node = reference
        # Whatever follows the marker is, at every ancestor level, a later
        # sibling of the marker or of one of its ancestors.
        while node is not None:
            for sibling in list(node.next_siblings):
                sibling.extract()
                removed += 1
            node = node.parent

        if not removed:
            logging.warning(
                "Reached root when looking for cutting point for teaser. "
                "Doc: %s", doc[:100])

    # Work on a detached copy so the marker can be located and cut in one pass.
    summary = copy.copy(body)
    teaser_end = find_teaser_end(summary)

    if teaser_end is None:
        result = bs4()
        recur_select_to_summarize(summary, result, 0)
    else:
        cut_after_element(teaser_end)
        result = summary

    # Re-root relative media sources and links under /blog/.  'img' is the
    # HTML image element; 'image' is kept for backward compatibility.
    rewrites = (
        (['video', 'img', 'image'], 'src'),
        (['a'], 'href'),
    )
    for tag_names, attribute in rewrites:
        for element in result.find_all(tag_names):
            target = element.get(attribute)
            # A ':' indicates a scheme (http:, mailto:, ...): leave as is.
            if target is not None and ':' not in target:
                element[attribute] = '/blog/' + target.lstrip('/')

    return result
|
||||||
|
|
||||||
def render_index(docs, dest_top):
|
def render_index(docs, dest_top):
|
||||||
docs = sorted(docs.values(), key=lambda x: x[1]['date'], reverse=True)
|
docs = sorted(docs.values(), key=lambda x: x[1]['date'], reverse=True)
|
||||||
|
Loading…
Reference in New Issue
Block a user