#!/usr/bin/env python3
"""Report broken local links in a tree of HTML files.

Usage: test-links.py FILES_TOP

Every ``*.html`` file below FILES_TOP is parsed, and each ``<a href=...>``
that points at a local path is checked against the filesystem.  The process
exits with status 1 when at least one broken link was found, 0 otherwise.
"""
# Provenance: added as scripts/test-links.py by commit
# 816cedea4d2c9a18d5e765720f26e7477fde058b (Sergio Martínez Portela,
# 2023-09-22), "wip: Script to test links for broken ones."

import logging
import os
import sys
from urllib.parse import unquote, urlsplit


def find_html_files(files_top):
    """Return the paths of all ``*.html`` files found below *files_top*.

    A running count is printed on a single line while the tree is walked.
    """
    print("Listing files...")
    found_files = []
    for root, _dirs, files in os.walk(files_top):
        for name in files:
            if name.endswith('.html'):
                found_files.append(os.path.join(root, name))
                print("\r{} files".format(len(found_files)), end='', flush=True)
    # Terminate the "\r"-rewritten progress line.
    print()
    return found_files


def resolve_link_target(files_top, fpath, href):
    """Map *href*, found in the file *fpath*, to a filesystem path.

    Returns ``None`` when the link cannot be checked on disk: it has a
    scheme (``https:``, ``mailto:``...), it names another host (``//host/x``),
    or it has no path component at all (``#anchor``, ``?query``, empty).

    Query strings and fragments are dropped and percent-escapes are decoded
    before the path is built.  Links starting with ``/`` are resolved against
    *files_top*, which is assumed to be the root of the site; all other links
    are resolved against the directory that contains *fpath*.
    """
    try:
        parts = urlsplit(href)
    except ValueError:
        # urlsplit only raises while parsing a host part, so the link is
        # (malformed but) external and cannot be checked locally.
        return None
    if parts.scheme or parts.netloc:
        return None

    path = unquote(parts.path)
    if not path:
        # Pure fragment/query links refer to the current page.
        return None
    if path.startswith('/'):
        return os.path.join(files_top, path.lstrip('/'))
    return os.path.join(os.path.dirname(fpath), path)


def find_broken_links(files_top, fpath):
    """Return ``(target, href)`` pairs for each broken local link in *fpath*.

    *target* is the filesystem path that was looked up and *href* the raw
    attribute value.  A link to an existing directory counts as valid.
    """
    # Imported here rather than at module level so the usage message and the
    # path helpers keep working when the third-party packages are missing.
    from bs4 import BeautifulSoup

    # Read bytes so BeautifulSoup detects the document encoding itself
    # instead of depending on the locale's default text encoding.
    with open(fpath, 'rb') as f:
        tree = BeautifulSoup(f.read(), features='lxml')

    broken = []
    for link in tree.find_all('a', href=True):
        href = link['href']
        target = resolve_link_target(files_top, fpath, href)
        if target is not None and not os.path.exists(target):
            broken.append((target, href))
    return broken


def main(files_top):
    """Check every HTML file below *files_top* and print the broken links.

    Returns the process exit status: 1 if any broken link was found, else 0.
    The caller is responsible for passing it to ``sys.exit``.
    """
    from tqdm import tqdm

    found_files = find_html_files(files_top)

    found_broken = False
    for fpath in tqdm(found_files):
        for target, href in find_broken_links(files_top, fpath):
            found_broken = True
            print("[{}] -[ error ]-> {} | {}".format(fpath, target, href))

    return 1 if found_broken else 0


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: {} FILES_TOP".format(sys.argv[0]), file=sys.stderr)
        # Non-zero so that a mis-invocation is not mistaken for "no broken links".
        sys.exit(2)

    logging.basicConfig(level=logging.INFO, format="%(levelname)-8s %(message)s")
    sys.exit(main(sys.argv[1]))