From 23f8fcefe56523a3b25c1d68c4f96f385d6340ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20Mart=C3=ADnez=20Portela?= Date: Tue, 3 Oct 2023 00:09:08 +0200 Subject: [PATCH] Improve link checking. --- scripts/test-links.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/scripts/test-links.py b/scripts/test-links.py index 5f142e2..1fbd25e 100644 --- a/scripts/test-links.py +++ b/scripts/test-links.py @@ -3,6 +3,7 @@ import logging import os import sys +import urllib.parse from bs4 import BeautifulSoup as bs4 @@ -19,25 +20,32 @@ def main(files_top): print("\r{} files".format(len(found_files)), end='', flush=True) print() - found_broken = False + found_broken = 0 for fpath in tqdm(found_files): with open(fpath) as f: tree = bs4(f.read(), features='lxml', parser='html5') - for link in tree.find_all('a'): - if 'href' not in link.attrs: - continue - if ':' in link['href']: - continue - if link['href'].startswith('/'): - target = link['href'] # TODO: Find a better way to model the root - else: - target = os.path.join(os.path.dirname(fpath), link['href']) - if os.path.isdir(target): - pass - elif not os.path.exists(target): - print("[{}] -[ error ]-> {} | {}".format(fpath, target, link['href'])) + + for tag, attr in [('a', 'href'), ('img', 'src'), ('audio', 'src'), ('video', 'src')]: + for link in tree.find_all(tag): + if attr not in link.attrs: + continue + link.attrs[attr] = link.attrs[attr].split('#')[0] + if not link.attrs[attr]: + continue + if ':' in link[attr]: + continue + if link[attr].startswith('/'): + target = os.path.join(os.path.abspath(files_top), urllib.parse.unquote(link[attr].lstrip('/'))) + else: + target = os.path.join(os.path.dirname(fpath), urllib.parse.unquote(link[attr])) + if os.path.isdir(target): + pass + elif not os.path.exists(target): + print("[{}] -[ error ]-> {} | {}".format(fpath, target, link[attr])) + found_broken += 1 if found_broken: + print(f"Found {found_broken} broken links") exit(1) else: exit(0)