#!/usr/bin/env python3
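# Walk a directory tree of HTML files and report relative links
# (a/href, img/src, audio/src, video/src) that do not resolve to an
# existing file on disk. Exits with status 1 if any broken link is found.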
import logging
import os
import sys
import urllib.parse

from bs4 import BeautifulSoup as bs4
from tqdm import tqdm


def main(files_top):
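    # Collect every .html file under files_top, printing a running count.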
    print("Listing files...")
    found_files = []
    for root, dirs, files in os.walk(files_top):
        for name in files:
            if name.endswith('.html'):
                found_files.append(os.path.join(root, name))
                print("\r{} files".format(len(found_files)), end='', flush=True)

    print()
    found_broken = 0
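    # Parse each HTML file and check every link-carrying tag attribute.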
    for fpath in tqdm(found_files):
        with open(fpath) as f:
            tree = bs4(f.read(), features='lxml')

        for tag, attr in [('a', 'href'), ('img', 'src'), ('audio', 'src'), ('video', 'src')]:
            for link in tree.find_all(tag):
                if attr not in link.attrs:
                    continue
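                # Strip any #fragment; skip empty values and anything with a
                # ':' (http:, mailto:, ...), which is not a local path.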
                link.attrs[attr] = link.attrs[attr].split('#')[0]
                if not link.attrs[attr]:
                    continue
                if ':' in link[attr]:
                    continue
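                # Root-relative paths resolve against files_top; everything
                # else resolves against the directory of the current file.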
                if link[attr].startswith('/'):
                    target = os.path.join(os.path.abspath(files_top), urllib.parse.unquote(link[attr].lstrip('/')))
                else:
                    target = os.path.join(os.path.dirname(fpath), urllib.parse.unquote(link[attr]))
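                # A directory target is accepted as-is; any other target must exist on disk.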
                if os.path.isdir(target):
                    pass
                elif not os.path.exists(target):
                    print("[{}] -[ error ]-> {} | {}".format(fpath, target, link[attr]))
                    found_broken += 1

    if found_broken:
        print(f"Found {found_broken} broken links")
        exit(1)
    else:
        exit(0)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: {} FILES_TOP".format(sys.argv[0]))
        exit(0)

    logging.basicConfig(level=logging.INFO, format="%(levelname)-8s %(message)s")
    exit(main(sys.argv[1]))