Improve link checking.

This commit is contained in:
Sergio Martínez Portela 2023-10-03 00:09:08 +02:00
parent 600e737767
commit 23f8fcefe5

View File

@@ -3,6 +3,7 @@
import logging import logging
import os import os
import sys import sys
import urllib.parse
from bs4 import BeautifulSoup as bs4 from bs4 import BeautifulSoup as bs4
@@ -19,25 +20,32 @@ def main(files_top):
print("\r{} files".format(len(found_files)), end='', flush=True) print("\r{} files".format(len(found_files)), end='', flush=True)
print() print()
found_broken = False found_broken = 0
for fpath in tqdm(found_files): for fpath in tqdm(found_files):
with open(fpath) as f: with open(fpath) as f:
tree = bs4(f.read(), features='lxml', parser='html5') tree = bs4(f.read(), features='lxml', parser='html5')
for link in tree.find_all('a'):
if 'href' not in link.attrs: for tag, attr in [('a', 'href'), ('img', 'src'), ('audio', 'src'), ('video', 'src')]:
continue for link in tree.find_all(tag):
if ':' in link['href']: if attr not in link.attrs:
continue continue
if link['href'].startswith('/'): link.attrs[attr] = link.attrs[attr].split('#')[0]
target = link['href'] # TODO: Find a better way to model the root if not link.attrs[attr]:
else: continue
target = os.path.join(os.path.dirname(fpath), link['href']) if ':' in link[attr]:
if os.path.isdir(target): continue
pass if link[attr].startswith('/'):
elif not os.path.exists(target): target = os.path.join(os.path.abspath(files_top), urllib.parse.unquote(link[attr].lstrip('/')))
print("[{}] -[ error ]-> {} | {}".format(fpath, target, link['href'])) else:
target = os.path.join(os.path.dirname(fpath), urllib.parse.unquote(link[attr]))
if os.path.isdir(target):
pass
elif not os.path.exists(target):
print("[{}] -[ error ]-> {} | {}".format(fpath, target, link[attr]))
found_broken += 1
if found_broken: if found_broken:
print(f"Found {found_broken} broken links")
exit(1) exit(1)
else: else:
exit(0) exit(0)