Improve link checking.
This commit is contained in:
parent
600e737767
commit
23f8fcefe5
@@ -3,6 +3,7 @@
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import urllib.parse
|
||||
|
||||
from bs4 import BeautifulSoup as bs4
|
||||
|
||||
@@ -19,25 +20,32 @@ def main(files_top):
|
||||
print("\r{} files".format(len(found_files)), end='', flush=True)
|
||||
|
||||
print()
|
||||
found_broken = False
|
||||
found_broken = 0
|
||||
for fpath in tqdm(found_files):
|
||||
with open(fpath) as f:
|
||||
tree = bs4(f.read(), features='lxml', parser='html5')
|
||||
for link in tree.find_all('a'):
|
||||
if 'href' not in link.attrs:
|
||||
continue
|
||||
if ':' in link['href']:
|
||||
continue
|
||||
if link['href'].startswith('/'):
|
||||
target = link['href'] # TODO: Find a better way to model the root
|
||||
else:
|
||||
target = os.path.join(os.path.dirname(fpath), link['href'])
|
||||
if os.path.isdir(target):
|
||||
pass
|
||||
elif not os.path.exists(target):
|
||||
print("[{}] -[ error ]-> {} | {}".format(fpath, target, link['href']))
|
||||
|
||||
for tag, attr in [('a', 'href'), ('img', 'src'), ('audio', 'src'), ('video', 'src')]:
|
||||
for link in tree.find_all(tag):
|
||||
if attr not in link.attrs:
|
||||
continue
|
||||
link.attrs[attr] = link.attrs[attr].split('#')[0]
|
||||
if not link.attrs[attr]:
|
||||
continue
|
||||
if ':' in link[attr]:
|
||||
continue
|
||||
if link[attr].startswith('/'):
|
||||
target = os.path.join(os.path.abspath(files_top), urllib.parse.unquote(link[attr].lstrip('/')))
|
||||
else:
|
||||
target = os.path.join(os.path.dirname(fpath), urllib.parse.unquote(link[attr]))
|
||||
if os.path.isdir(target):
|
||||
pass
|
||||
elif not os.path.exists(target):
|
||||
print("[{}] -[ error ]-> {} | {}".format(fpath, target, link[attr]))
|
||||
found_broken += 1
|
||||
|
||||
if found_broken:
|
||||
print(f"Found {found_broken} broken links")
|
||||
exit(1)
|
||||
else:
|
||||
exit(0)
|
||||
|
Loading…
Reference in New Issue
Block a user