Add initial concept.

This commit is contained in:
kenkeiras 2017-07-05 00:05:01 +02:00
commit 015b4da5e9
3 changed files with 138 additions and 0 deletions

18
.gitignore vendored Normal file
View File

@ -0,0 +1,18 @@
# Temporary editor files
*#*
*~
*.swp
# Python temporary files
__pycache__
*.pyc
# Cache files
cache/
# Build files
dist/
*.egg-info/
# Directories for testing
test_ma

118
macli.py Normal file
View File

@ -0,0 +1,118 @@
#!/usr/bin/env python3
import hashlib
import os
import argparse
import urllib
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup as bs4
def get_parser():
    """Build the command-line parser: a single positional ``url``."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('url')
    return arg_parser
def get_extension(path):
    """Return the file extension of *path* (text after the last dot).

    Handles both '/' and '\\' as path separators; if there is no dot,
    the whole final path component is returned unchanged.
    """
    tail = path.split('/')[-1]
    tail = tail.split('\\')[-1]
    return tail.split('.')[-1]
def relink_css(content, base_url, directory='styles'):
    """Download every stylesheet referenced by *content* and rewrite the
    <link rel="stylesheet"> tags to point at the local copies.

    content: parsed document tree (BeautifulSoup) — modified in place.
    base_url: URL the document came from; used to resolve relative hrefs.
    directory: target folder for downloads (created if missing), relative
        to the current working directory.
    """
    os.makedirs(directory, exist_ok=True)
    for link in content.find_all('link', rel='stylesheet'):
        href = urllib.parse.urljoin(base_url, link['href'], allow_fragments=False)
        # Content-addressed filename: same URL always maps to the same
        # local file, so repeated runs reuse earlier downloads.
        name = hashlib.sha1(href.encode()).hexdigest() + '.css'
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            # FIX: previously this bound the download to `content`,
            # shadowing the document parameter; use a distinct name.
            data = urllib.request.urlopen(href).read()
            with open(path, 'wb') as f:
                f.write(data)
        link['href'] = path
def relink_images(content, base_url, directory='images'):
    """Download every image referenced by *content* and rewrite the
    <img> tags to point at the local copies.

    content: parsed document tree (BeautifulSoup) — modified in place.
    base_url: URL the document came from; used to resolve relative srcs.
    directory: target folder for downloads (created if missing), relative
        to the current working directory.
    """
    os.makedirs(directory, exist_ok=True)
    for image in content.find_all('img'):
        src = urllib.parse.urljoin(base_url, image['src'], allow_fragments=False)
        # Content-addressed filename, keeping the original extension so
        # browsers/servers can still infer the image type.
        name = hashlib.sha1(src.encode()).hexdigest() + '.' + get_extension(src)
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            # FIX: previously this bound the download to `content`,
            # shadowing the document parameter; use a distinct name.
            data = urllib.request.urlopen(src).read()
            with open(path, 'wb') as f:
                f.write(data)
        image['src'] = path
def relink_scripts(content, base_url, directory='scripts'):
    """Download every external script referenced by *content* and rewrite
    the <script> tags to point at the local copies.

    content: parsed document tree (BeautifulSoup) — modified in place.
    base_url: URL the document came from; used to resolve relative srcs.
    directory: target folder for downloads (created if missing), relative
        to the current working directory.
    """
    os.makedirs(directory, exist_ok=True)
    for script in content.find_all('script'):
        # FIX: inline <script> blocks have no src attribute — indexing
        # script['src'] raised KeyError on any page with inline JS.
        if 'src' not in script.attrs:
            continue
        href = urllib.parse.urljoin(base_url, script['src'], allow_fragments=False)
        # Content-addressed filename, keeping the original extension.
        name = hashlib.sha1(href.encode()).hexdigest() + '.' + get_extension(href)
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            # FIX: previously this bound the download to `content`,
            # shadowing the document parameter; use a distinct name.
            data = urllib.request.urlopen(href).read()
            with open(path, 'wb') as f:
                f.write(data)
        script['src'] = path
def relink_links(content, base_url):
    """Rewrite relative hrefs in *content* to absolute URLs.

    Covers <a> anchors and non-stylesheet <link> tags (stylesheets are
    handled separately, since those get downloaded locally).  The tree is
    modified in place; tags without an href are left alone.
    """
    for anchor in content.find_all('a'):
        if 'href' in anchor.attrs:
            anchor['href'] = urllib.parse.urljoin(base_url, anchor['href'])
    for tag in content.find_all('link'):
        if 'href' in tag.attrs and 'stylesheet' not in tag.attrs.get('rel', ''):
            tag['href'] = urllib.parse.urljoin(base_url, tag['href'])
def archive_to_dir(directory, url):
    """Fetch *url*, localize its assets, and save it as an archive.

    Downloads the page into *directory* (which must already exist),
    pulls its stylesheets, scripts and images into subfolders, rewrites
    remaining links to absolute URLs, and writes the result as
    ``index.html`` inside *directory*.
    """
    current_path = os.getcwd()
    os.chdir(directory)
    # FIX: restore the caller's working directory even when a download
    # or parse step raises — previously an exception left the process
    # chdir'd into the archive directory.
    try:
        # Download and parse the page (html5lib for lenient parsing).
        base_file = urllib.request.urlopen(url).read()
        archived_content = bs4(base_file, 'html5lib')
        # Localize assets, then absolutize whatever links remain.
        relink_css(archived_content, url)
        relink_scripts(archived_content, url)
        relink_images(archived_content, url)
        relink_links(archived_content, url)
        with open('index.html', 'wt') as f:
            f.write(str(archived_content))
    finally:
        os.chdir(current_path)
def main():
    """CLI entry point: archive the URL given on the command line."""
    options = get_parser().parse_args()
    target = 'test_ma'
    os.makedirs(target, exist_ok=True)
    archive_to_dir(target, options.url)


if __name__ == '__main__':
    main()

2
requirements.txt Normal file
View File

@ -0,0 +1,2 @@
bs4
html5lib