#!/usr/bin/env python3
"""Archive a single web page, along with its stylesheets, images, scripts,
and icons, into a self-contained local directory."""

import argparse
import hashlib
import os
import re
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup as bs4

import progress_meter

USER_AGENT = 'miniarchiver bot'
ARCHIVE_ROOT = 'archive'
DEFAULT_NAME = 'archived_web'
ALLOWED_NAMES_RE = re.compile(r'^[- .,:@a-zA-Z0-9]+$')

# Each descriptor is (find_all() selector, target directory, URL attribute).
OBJECT_TYPE_DESCRIPTORS = (
    ({'name': 'link', 'rel': 'stylesheet'}, 'styles', 'href'),
    ({'name': 'img'}, 'images', 'src'),
    ({'name': 'script'}, 'scripts', 'src'),
    ({'name': 'link', 'rel': 'icon'}, 'icons', 'href'),
)


def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('url')
    parser.add_argument('--name', '-n', default=DEFAULT_NAME)
    parser.add_argument('--force', '-f', action='store_true')
    return parser


def get_extension(path):
    # Everything after the last '/', '\' and '.' is taken as the extension.
    return (path
            .split('/')[-1]
            .split('\\')[-1]
            .split('.')[-1])


def request(url):
    # Send an explicit User-Agent; urllib's default is often blocked.
    req = urllib.request.Request(
        url,
        data=None,
        headers={'User-Agent': USER_AGENT},
    )
    return urllib.request.urlopen(req)


def show_error(e, href=None):
    # Clear the progress line, then print the error white-on-red.
    print("\r\x1b[K\x1b[41;37m{}\x1b[0m".format(e))
    if href is not None:
        print("Url: {}".format(href))


def archive(content, base_url, selector, directory, attribute, progbar):
    """Download every element matching `selector` into `directory` and
    rewrite its `attribute` to point at the local copy."""
    os.makedirs(directory, exist_ok=True)
    for part in content.find_all(**selector):
        if attribute not in part.attrs:
            continue
        if part[attribute].startswith('data:'):
            # Inline data URIs need no download.
            continue
        href = urllib.parse.urljoin(base_url, part[attribute],
                                    allow_fragments=False)
        progbar.next_iter(href)
        # Hash the URL to get a unique, filesystem-safe file name.
        name = (hashlib.sha1(href.encode()).hexdigest()
                + '.' + get_extension(href))
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            try:
                # Read into `data`, not `content`: reassigning `content`
                # would shadow the parsed document passed in above.
                data = request(href).read()
            except Exception as e:
                show_error(e, href)
                continue
            with open(path, 'wb') as f:
                f.write(data)
        part[attribute] = path


def relink_links(content, base_url):
    """Rewrite <a> and <link> hrefs as absolute URLs so they still work
    from the local copy."""
    for link in content.find_all('a'):
        if 'href' not in link.attrs:
            continue
        link['href'] = urllib.parse.urljoin(base_url, link['href'])
    # Stylesheets are skipped here; archive() replaces them with local copies.
    for link in content.find_all('link'):
        if 'href' not in link.attrs or 'stylesheet' in link.attrs.get('rel', ''):
            continue
        link['href'] = urllib.parse.urljoin(base_url, link['href'])


def get_num_elements(content):
    """Count the elements archive() will process, to size the progress bar."""
    return sum(
        1
        for (selector, _, attribute) in OBJECT_TYPE_DESCRIPTORS
        for element in content.find_all(**selector)
        if attribute in element.attrs
    )


def archive_to_dir(directory, url):
    current_path = os.getcwd()
    os.chdir(directory)

    # Download and parse the page itself.
    base_file = request(url).read()
    archived_content = bs4(base_file, 'html5lib')

    # Relink <a> and <link> elements to absolute URLs.
    relink_links(archived_content, url)

    # Archive referenced objects (styles, images, scripts, icons).
    progbar = progress_meter.ProgressBar(get_num_elements(archived_content))
    # Use `subdir` for the loop variable so it does not shadow `directory`.
    for (selector, subdir, attribute) in OBJECT_TYPE_DESCRIPTORS:
        archive(archived_content, url, selector, subdir, attribute, progbar)

    with open('index.html', 'wt', encoding='utf-8') as f:
        f.write(str(archived_content))

    os.chdir(current_path)


def main():
    args = get_parser().parse_args()
    path = os.path.join(ARCHIVE_ROOT, args.name)
    if not ALLOWED_NAMES_RE.match(args.name):
        print("Only characters 'a-zA-Z0-9', spaces, or '-.,:@' "
              "are allowed as names.")
        return
    if os.path.exists(path) and not args.force:
        print("Archive “{}” already exists; pick a different name with '-n' "
              "or force an overwrite with '-f'.".format(args.name))
        return
    os.makedirs(path, exist_ok=True)
    archive_to_dir(path, args.url)


if __name__ == '__main__':
    main()
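

# ---------------------------------------------------------------------------
# Appendix: `progress_meter` is a local helper module, not a PyPI package.
# The class below is a minimal sketch of the interface this script assumes
# (constructed with a total element count, advanced with next_iter(label));
# the real module may render its bar differently. It is illustration only
# and is not used by the code above.

class _ProgressBarSketch:
    """Hypothetical stand-in for progress_meter.ProgressBar."""

    def __init__(self, total):
        self.total = total
        self.current = 0

    def next_iter(self, label):
        # Overwrite a single status line: return to column 0, clear it,
        # then print the running count and the current URL.
        self.current += 1
        print('\r\x1b[K[{}/{}] {}'.format(self.current, self.total, label),
              end='', flush=True)

# Example invocation (assuming this file is saved as miniarchiver.py):
#   $ python3 miniarchiver.py https://example.com -n 'example site'
# This writes archive/example site/index.html, with the page's assets in
# styles/, images/, scripts/ and icons/ beside it.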