#!/usr/bin/env python3
"""Check a tree of static HTML files for broken local links.

Usage: <script> FILES_TOP

Every ``*.html`` file below FILES_TOP is parsed, and each local reference
found in ``<a href>``, ``<img src>``, ``<audio src>`` and ``<video src>`` is
resolved against the filesystem. References that do not resolve to an
existing file or directory are printed, and the process exits with status 1.
"""
import logging
import os
import sys
import urllib.parse
from typing import List, Optional

from bs4 import BeautifulSoup as bs4
from tqdm import tqdm

# (tag, attribute) pairs whose attribute value is checked as a link.
LINK_ATTRS = [('a', 'href'), ('img', 'src'), ('audio', 'src'), ('video', 'src')]


def _list_html_files(files_top: str) -> List[str]:
    """Return the paths of all ``*.html`` files below *files_top*.

    A running count is written to stdout (overwriting itself via ``\\r``)
    while the tree is walked.
    """
    found_files = []
    for root, _dirs, files in os.walk(files_top):
        for name in files:
            if name.endswith('.html'):
                found_files.append(os.path.join(root, name))
                print("\r{} files".format(len(found_files)), end='', flush=True)
    print()
    return found_files


def _local_target(ref: str, fpath: str, top_abs: str) -> Optional[str]:
    """Map a fragment-less link *ref* found in *fpath* to a filesystem path.

    Args:
        ref: The attribute value with any ``#fragment`` already removed.
        fpath: Path of the HTML file that contains the link.
        top_abs: Absolute path of the site root, used for ``/``-rooted links.

    Returns:
        The path the link should resolve to, or ``None`` when the link is not
        a local file reference (external URL, or nothing left but a query).
    """
    # Any ':' is taken to mean a scheme (http:, mailto:, ...), and
    # '//host/path' is a protocol-relative URL: both point off-site.
    if ':' in ref or ref.startswith('//'):
        return None

    # A query string is not part of the file name on disk.
    path = ref.split('?', 1)[0]
    if not path:
        return None

    # Decide absolute-vs-relative on the still-quoted form, so that an
    # encoded '%2F' is never mistaken for a leading path separator.
    if path.startswith('/'):
        return os.path.join(top_abs, urllib.parse.unquote(path.lstrip('/')))
    return os.path.join(os.path.dirname(fpath), urllib.parse.unquote(path))


def main(files_top: str) -> int:
    """Scan *files_top* for HTML files and report broken local links.

    Args:
        files_top: Directory that is both the root of the scan and the root
            against which ``/``-prefixed links are resolved.

    Returns:
        ``1`` if at least one broken link was found, ``0`` otherwise. The
        value is meant to be passed to ``sys.exit``.
    """
    print("Listing files...")
    found_files = _list_html_files(files_top)

    top_abs = os.path.abspath(files_top)
    found_broken = 0
    for fpath in tqdm(found_files):
        # Read bytes rather than text: decoding with the locale's default
        # encoding can fail on valid documents, whereas the parser can work
        # out the document's encoding itself when handed raw bytes.
        with open(fpath, 'rb') as f:
            # NOTE(review): the `parser='html5'` keyword is kept exactly as in
            # the original call; confirm whether the lxml builder uses it.
            tree = bs4(f.read(), features='lxml', parser='html5')

        for tag, attr in LINK_ATTRS:
            for link in tree.find_all(tag):
                value = link.get(attr)
                if not value:
                    continue
                # Fragments address a position inside the target document and
                # are irrelevant to whether the target exists.
                ref = value.split('#')[0]
                target = _local_target(ref, fpath, top_abs)
                # Directories count as valid targets; exists() covers them.
                if target is None or os.path.exists(target):
                    continue
                print("[{}] -[ error ]-> {} | {}".format(fpath, target, ref))
                found_broken += 1

    if found_broken:
        print(f"Found {found_broken} broken links")
        return 1
    return 0


if __name__ == "__main__":
    if len(sys.argv) != 2:
        # A usage error must not look like a successful run to callers/CI.
        print("Usage: {} FILES_TOP".format(sys.argv[0]), file=sys.stderr)
        sys.exit(2)

    logging.basicConfig(level=logging.INFO, format="%(levelname)-8s %(message)s")
    sys.exit(main(sys.argv[1]))