52 lines
1.5 KiB
Python
52 lines
1.5 KiB
Python
|
#!/usr/bin/env python3
|
||
|
|
||
|
import logging
|
||
|
import os
|
||
|
import sys
|
||
|
|
||
|
from bs4 import BeautifulSoup as bs4
|
||
|
|
||
|
from tqdm import tqdm
|
||
|
|
||
|
def main(files_top):
    """Check every ``.html`` file under *files_top* for broken local links.

    Walks the directory tree rooted at *files_top*, parses each HTML file
    found, and tests whether the target of every ``<a href=...>`` exists on
    disk. Hrefs containing ``:`` (for example ``http:`` or ``mailto:``) are
    skipped. Every broken link is printed on its own line.

    This function does not return: it terminates the process through
    ``sys.exit`` with status 1 when at least one broken link was found and
    status 0 otherwise.
    """
    print("Listing files...")
    found_files = []
    for root, _dirs, files in os.walk(files_top):
        for name in files:
            if name.endswith('.html'):
                found_files.append(os.path.join(root, name))
        # The carriage return rewrites the running count in place.
        print("\r{} files".format(len(found_files)), end='', flush=True)

    print()
    found_broken = False
    for fpath in tqdm(found_files):
        with open(fpath) as f:
            # NOTE(review): `parser='html5'` is passed alongside
            # features='lxml'; confirm the installed bs4 accepts it.
            tree = bs4(f.read(), features='lxml', parser='html5')

        base_dir = os.path.dirname(fpath)
        for link in tree.find_all('a'):
            if 'href' not in link.attrs:
                continue
            href = link['href']
            if ':' in href:
                continue

            # The fragment and query string are not part of the on-disk
            # path; keeping them would flag "page.html#sec" as missing.
            path = href.split('#', 1)[0].split('?', 1)[0]
            if not path:
                # Pure "#anchor" / "?query" link: refers to this same file.
                continue

            if path.startswith('/'):
                target = path  # TODO: Find a better way to model the root
            else:
                target = os.path.join(base_dir, path)

            # An existing directory also satisfies os.path.exists().
            if not os.path.exists(target):
                # Remember the failure so the exit status reflects it.
                found_broken = True
                print("[{}] -[ error ]-> {} | {}".format(fpath, target, href))

    sys.exit(1 if found_broken else 0)
|
||
|
|
||
|
if __name__ == "__main__":
    # Exactly one argument is expected: the top directory to scan.
    if len(sys.argv) != 2:
        print("Usage: {} FILES_TOP".format(sys.argv[0]), file=sys.stderr)
        # Exit non-zero so a wrong invocation is not mistaken for success.
        sys.exit(2)

    logging.basicConfig(level=logging.INFO, format="%(levelname)-8s %(message)s")
    sys.exit(main(sys.argv[1]))
|