#!/usr/bin/env python3
"""Archive a single web page: download its stylesheets, scripts, and
images, rewrite their references to the local copies, and absolutize the
remaining links."""

import argparse
import hashlib
import os
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup as bs4

USER_AGENT = 'miniarchiver bot'


def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('url')
    return parser


def get_extension(path):
    # Strip any query string or fragment first, so 'style.css?v=3'
    # yields 'css' rather than 'css?v=3'.
    path = urllib.parse.urlparse(path).path
    return (path
            .split('/')[-1]
            .split('\\')[-1]
            .split('.')[-1])


def request(url):
    req = urllib.request.Request(
        url,
        data=None,
        headers={
            'User-Agent': USER_AGENT,
        }
    )
    return urllib.request.urlopen(req)


def relink_css(content, base_url, directory='styles'):
    os.makedirs(directory, exist_ok=True)
    for link in content.find_all('link', rel='stylesheet', href=True):
        href = urllib.parse.urljoin(base_url, link['href'],
                                    allow_fragments=False)
        # Name the local copy after the SHA-1 of its absolute URL so
        # repeated references are downloaded only once.
        name = hashlib.sha1(href.encode()).hexdigest() + '.css'
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            data = request(href).read()
            with open(path, 'wb') as f:
                f.write(data)
        link['href'] = path


def relink_images(content, base_url, directory='images'):
    os.makedirs(directory, exist_ok=True)
    # src=True skips <img> tags that lack a src attribute, which would
    # otherwise raise a KeyError.
    for image in content.find_all('img', src=True):
        src = urllib.parse.urljoin(base_url, image['src'],
                                   allow_fragments=False)
        name = hashlib.sha1(src.encode()).hexdigest() + '.' + get_extension(src)
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            data = request(src).read()
            with open(path, 'wb') as f:
                f.write(data)
        image['src'] = path


def relink_scripts(content, base_url, directory='scripts'):
    os.makedirs(directory, exist_ok=True)
    # Only external scripts carry a src attribute; inline scripts are skipped.
    for script in content.find_all('script', src=True):
        href = urllib.parse.urljoin(base_url, script['src'],
                                    allow_fragments=False)
        name = hashlib.sha1(href.encode()).hexdigest() + '.' + get_extension(href)
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            data = request(href).read()
            with open(path, 'wb') as f:
                f.write(data)
        script['src'] = path


def relink_links(content, base_url):
    # Rewrite page links as absolute URLs so they still resolve from the
    # local copy.
    for link in content.find_all('a'):
        if 'href' not in link.attrs:
            continue
        link['href'] = urllib.parse.urljoin(base_url, link['href'])

    # Do the same for <link> elements, except stylesheets, which
    # relink_css has already pointed at local files.
    for link in content.find_all('link'):
        if 'href' not in link.attrs or 'stylesheet' in link.attrs.get('rel', ''):
            continue
        link['href'] = urllib.parse.urljoin(base_url, link['href'])


def archive_to_dir(directory, url):
    current_path = os.getcwd()
    os.chdir(directory)
    try:
        # Download the page itself.
        base_file = request(url).read()
        archived_content = bs4(base_file, 'html5lib')

        # Localize assets, then absolutize the remaining links.
        relink_css(archived_content, url)
        relink_scripts(archived_content, url)
        relink_images(archived_content, url)
        relink_links(archived_content, url)

        with open('index.html', 'wt', encoding='utf-8') as f:
            f.write(str(archived_content))
    finally:
        # Restore the working directory even if a download fails.
        os.chdir(current_path)


def main():
    args = get_parser().parse_args()
    os.makedirs('test_ma', exist_ok=True)
    archive_to_dir('test_ma', args.url)


if __name__ == '__main__':
    main()
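
# Example invocation (a sketch: the filename miniarchiver.py is an
# assumption, as the script does not name itself):
#
#   python3 miniarchiver.py https://example.com/
#
# This writes test_ma/index.html alongside test_ma/styles/,
# test_ma/scripts/, and test_ma/images/, with each downloaded asset named
# by the SHA-1 hash of its absolute URL.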