From 23f8fcefe56523a3b25c1d68c4f96f385d6340ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergio=20Mart=C3=ADnez=20Portela?=
 <sergio@codigoparallevar.com>
Date: Tue, 3 Oct 2023 00:09:08 +0200
Subject: [PATCH] Improve link checking.

---
 scripts/test-links.py | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/scripts/test-links.py b/scripts/test-links.py
index 5f142e2..1fbd25e 100644
--- a/scripts/test-links.py
+++ b/scripts/test-links.py
@@ -3,6 +3,7 @@
 import logging
 import os
 import sys
+import urllib.parse
 
 from bs4 import BeautifulSoup as bs4
 
@@ -19,25 +20,32 @@ def main(files_top):
                 print("\r{} files".format(len(found_files)), end='', flush=True)
 
     print()
-    found_broken = False
+    found_broken = 0
     for fpath in tqdm(found_files):
         with open(fpath) as f:
             tree = bs4(f.read(), features='lxml', parser='html5')
-        for link in tree.find_all('a'):
-            if 'href' not in link.attrs:
-                continue
-            if ':' in link['href']:
-                continue
-            if link['href'].startswith('/'):
-                target = link['href']  # TODO: Find a better way to model the root
-            else:
-                target = os.path.join(os.path.dirname(fpath), link['href'])
-            if os.path.isdir(target):
-                pass
-            elif not os.path.exists(target):
-                print("[{}] -[ error ]-> {} | {}".format(fpath, target, link['href']))
+
+        for tag, attr in [('a', 'href'), ('img', 'src'), ('audio', 'src'), ('video', 'src')]:
+            for link in tree.find_all(tag):
+                if attr not in link.attrs:
+                    continue
+                link.attrs[attr] = link.attrs[attr].split('#')[0]
+                if not link.attrs[attr]:
+                    continue
+                if ':' in link[attr]:
+                    continue
+                if link[attr].startswith('/'):
+                    target = os.path.join(os.path.abspath(files_top), urllib.parse.unquote(link[attr].lstrip('/')))
+                else:
+                    target = os.path.join(os.path.dirname(fpath), urllib.parse.unquote(link[attr]))
+                if os.path.isdir(target):
+                    pass
+                elif not os.path.exists(target):
+                    print("[{}] -[ error ]-> {} | {}".format(fpath, target, link[attr]))
+                    found_broken += 1
 
     if found_broken:
+        print(f"Found {found_broken} broken links")
         exit(1)
     else:
         exit(0)