#!/usr/bin/env python3
"""Check local links in every ``.html`` file below a directory.

Usage: pass the top-level directory as the single command-line argument.
Every ``<a href=...>`` whose href contains no ``:`` is resolved against the
filesystem; targets that do not exist are printed and make the script exit
with status 1.
"""
import logging
import os
import sys
from urllib.parse import unquote

from bs4 import BeautifulSoup as bs4
from tqdm import tqdm


def _find_html_files(files_top):
    """Return the paths of all ``*.html`` files below *files_top*.

    A running count is printed on a single, carriage-return-refreshed line
    while the tree is walked.
    """
    found_files = []
    for root, _dirs, files in os.walk(files_top):
        for name in files:
            if name.endswith('.html'):
                found_files.append(os.path.join(root, name))
                print("\r{} files".format(len(found_files)), end='', flush=True)
    return found_files


def _link_target(fpath, href):
    """Map *href*, found in the file *fpath*, to a filesystem path.

    Returns ``None`` when the link is not checked: hrefs containing ``:``
    (this covers anything with a URL scheme such as ``http:`` or
    ``mailto:``), protocol-relative ``//host/...`` references, and links
    that only carry a fragment and/or query (they point at the current
    page).
    """
    if ':' in href or href.startswith('//'):
        return None

    # The fragment and query are not part of the file name on disk, and
    # percent-escapes (e.g. "%20") have to be decoded before the lookup.
    path = unquote(href.split('#', 1)[0].split('?', 1)[0])
    if not path:
        return None

    if path.startswith('/'):
        return path  # TODO: Find a better way to model the root
    return os.path.join(os.path.dirname(fpath), path)


def main(files_top):
    """Scan *files_top* for broken local links.

    Each broken link is printed as
    ``[<file>] -[ error ]-> <resolved target> | <raw href>``.

    Returns:
        The process exit status: 1 if at least one broken link was found,
        0 otherwise.
    """
    print("Listing files...")
    found_files = _find_html_files(files_top)
    print()

    found_broken = False
    for fpath in tqdm(found_files):
        # Read raw bytes so Beautiful Soup detects the document encoding
        # itself instead of depending on the locale's default codec.
        with open(fpath, 'rb') as f:
            # NOTE(review): `parser='html5'` is kept from the original call;
            # it looks redundant next to features='lxml' -- confirm.
            tree = bs4(f.read(), features='lxml', parser='html5')

        for link in tree.find_all('a'):
            href = link.get('href')
            if href is None:
                continue
            target = _link_target(fpath, href)
            if target is None:
                continue
            # Directories count as valid targets; exists() covers them too.
            if not os.path.exists(target):
                found_broken = True
                print("[{}] -[ error ]-> {} | {}".format(fpath, target, href))

    return 1 if found_broken else 0


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: {} FILES_TOP".format(sys.argv[0]), file=sys.stderr)
        sys.exit(2)  # a usage error must not look like a successful run
    logging.basicConfig(level=logging.INFO,
                        format="%(levelname)-8s %(message)s")
    sys.exit(main(sys.argv[1]))