From 015b4da5e9b133896a2cccb5fdf583d6f319aac6 Mon Sep 17 00:00:00 2001
From: kenkeiras
Date: Wed, 5 Jul 2017 00:05:01 +0200
Subject: [PATCH] Add initial concept.

---
 .gitignore       |  18 +++++++
 macli.py         | 118 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   2 +
 3 files changed, 138 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 macli.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6476120
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,18 @@
+# Temporary editor files
+*#*
+*~
+*.swp
+
+# Python temporary files
+__pycache__
+*.pyc
+
+# Cache files
+cache/
+
+# Build files
+dist/
+*.egg-info/
+
+# Directories for testing
+test_ma
diff --git a/macli.py b/macli.py
new file mode 100644
index 0000000..cbeb374
--- /dev/null
+++ b/macli.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+
+import hashlib
+import os
+import argparse
+import urllib
+import urllib.request
+import urllib.parse
+from bs4 import BeautifulSoup as bs4
+
+
+def get_parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('url')
+    return parser
+
+
+def get_extension(path):
+    return (path
+            .split('/')[-1]
+            .split('\\')[-1]
+            .split('.')[-1])
+
+
+def relink_css(content, base_url, directory='styles'):
+    os.makedirs(directory, exist_ok=True)
+    for link in content.find_all('link', rel='stylesheet', href=True):
+        href = urllib.parse.urljoin(base_url, link['href'], allow_fragments=False)
+
+        name = hashlib.sha1(href.encode()).hexdigest() + '.css'
+        path = os.path.join(directory, name)
+        if not os.path.exists(path):
+            data = urllib.request.urlopen(href).read()
+            with open(path, 'wb') as f:
+                f.write(data)
+
+        link['href'] = path
+
+
+def relink_images(content, base_url, directory='images'):
+    os.makedirs(directory, exist_ok=True)
+    for image in content.find_all('img', src=True):
+        src = urllib.parse.urljoin(base_url, image['src'], allow_fragments=False)
+
+        name = hashlib.sha1(src.encode()).hexdigest() + '.' + get_extension(src)
+        path = os.path.join(directory, name)
+        if not os.path.exists(path):
+            data = urllib.request.urlopen(src).read()
+            with open(path, 'wb') as f:
+                f.write(data)
+
+        image['src'] = path
+
+
+def relink_scripts(content, base_url, directory='scripts'):
+    os.makedirs(directory, exist_ok=True)
+    for script in content.find_all('script', src=True):
+        href = urllib.parse.urljoin(base_url, script['src'], allow_fragments=False)
+
+        name = hashlib.sha1(href.encode()).hexdigest() + '.' + get_extension(href)
+        path = os.path.join(directory, name)
+        if not os.path.exists(path):
+            data = urllib.request.urlopen(href).read()
+            with open(path, 'wb') as f:
+                f.write(data)
+
+        script['src'] = path
+
+
+def relink_links(content, base_url):
+    for link in content.find_all('a'):
+        if 'href' not in link.attrs:
+            continue
+
+        full_href = urllib.parse.urljoin(base_url, link['href'])
+        link['href'] = full_href
+
+    for link in content.find_all('link'):
+        if 'href' not in link.attrs or 'stylesheet' in link.attrs.get('rel', ''):
+            continue
+
+        full_href = urllib.parse.urljoin(base_url, link['href'])
+        link['href'] = full_href
+
+
+def archive_to_dir(directory, url):
+    current_path = os.getcwd()
+    os.chdir(directory)
+    # Download file
+    base_file = urllib.request.urlopen(url).read()
+    archived_content = bs4(base_file, 'html5lib')
+
+    # Relink CSS
+    relink_css(archived_content, url)
+
+    # Relink scripts
+    relink_scripts(archived_content, url)
+
+    # Relink images
+    relink_images(archived_content, url)
+
+    # Relink links
+    relink_links(archived_content, url)
+
+    with open('index.html', 'wt') as f:
+        f.write(str(archived_content))
+
+    os.chdir(current_path)
+
+
+def main():
+    args = get_parser().parse_args()
+    os.makedirs('test_ma', exist_ok=True)
+    archive_to_dir('test_ma', args.url)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..2ec7313
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+bs4
+html5lib
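
For context, a minimal usage sketch of the archiver this patch introduces, written against the functions in macli.py (the "snapshot" directory name and the example.com URL are placeholders; bs4 and html5lib from requirements.txt are assumed to be installed):

    import os

    import macli

    # archive_to_dir() chdirs into the target directory but does not
    # create it, so it has to exist first; main() does the same with
    # its hard-coded 'test_ma' directory.
    os.makedirs('snapshot', exist_ok=True)
    macli.archive_to_dir('snapshot', 'https://example.com')

    # 'snapshot' now contains index.html plus styles/, scripts/ and
    # images/ subdirectories holding SHA-1-named local copies of the
    # page's assets.

The equivalent from the shell is python3 macli.py <url>, where the URL is the only positional argument; output always goes to ./test_ma for now.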