From 3e15c50e227c481adce3d948af173b815e71a46d Mon Sep 17 00:00:00 2001
From: kenkeiras
Date: Wed, 5 Jul 2017 00:39:01 +0200
Subject: [PATCH] Unify archiving method.

---
 macli.py | 104 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 53 insertions(+), 51 deletions(-)

diff --git a/macli.py b/macli.py
index e3d595e..95e5ae8 100644
--- a/macli.py
+++ b/macli.py
@@ -11,6 +11,40 @@ from bs4 import BeautifulSoup as bs4
 
 USER_AGENT = 'miniarchiver bot'
 
+OBJECT_TYPE_DESCRIPTORS = (
+    (
+        {
+            'name': 'link',
+            'rel': 'stylesheet'
+        },
+        'styles',
+        'href',
+    ),
+    (
+        {
+            'name': 'img'
+        },
+        'images',
+        'src',
+    ),
+    (
+        {
+            'name': 'script'
+        },
+        'scripts',
+        'src',
+    ),
+    (
+        {
+            'name': 'link',
+            'rel': 'icon'
+        },
+        'icons',
+        'href',
+    ),
+)
+
+
 def get_parser():
     parser = argparse.ArgumentParser()
     parser.add_argument('url')
@@ -34,55 +68,27 @@ def request(url):
     )
     return urllib.request.urlopen(req)
 
 
-def relink_css(content, base_url, directory='styles'):
-    os.makedirs(directory, exist_ok=True)
-    for link in content.find_all('link', rel='stylesheet'):
-        href = urllib.parse.urljoin(base_url, link['href'], allow_fragments=False)
-        name = hashlib.sha1(href.encode()).hexdigest() + '.css'
+def archive(content, base_url, selector, directory, attribute):
+    os.makedirs(directory, exist_ok=True)
+    for part in content.find_all(**selector):
+        if attribute not in part.attrs:
+            continue
+
+        href = urllib.parse.urljoin(base_url, part[attribute],
+                                    allow_fragments=False)
+
+        name = (hashlib.sha1(href.encode()).hexdigest()
+                + '.'
+                + get_extension(href))
+
         path = os.path.join(directory, name)
         if not os.path.exists(path):
             content = request(href).read()
             with open(path, 'wb') as f:
                 f.write(content)
 
-        link['href'] = path
-
-
-def relink_images(content, base_url, directory='images'):
-    os.makedirs(directory, exist_ok=True)
-    for image in content.find_all('img'):
-        if 'src' not in image.attrs:
-            continue
-
-        src = urllib.parse.urljoin(base_url, image['src'], allow_fragments=False)
-
-        name = hashlib.sha1(src.encode()).hexdigest() + '.' + get_extension(src)
-        path = os.path.join(directory, name)
-        if not os.path.exists(path):
-            content = request(src).read()
-            with open(path, 'wb') as f:
-                f.write(content)
-
-        image['src'] = path
-
-
-def relink_scripts(content, base_url, directory='scripts'):
-    os.makedirs(directory, exist_ok=True)
-    for script in content.find_all('script'):
-        if 'src' not in script.attrs:
-            continue
-
-        href = urllib.parse.urljoin(base_url, script['src'], allow_fragments=False)
-
-        name = hashlib.sha1(href.encode()).hexdigest() + '.' + get_extension(href)
-        path = os.path.join(directory, name)
-        if not os.path.exists(path):
-            content = request(href).read()
-            with open(path, 'wb') as f:
-                f.write(content)
-
-        script['src'] = path
+        part[attribute] = path
 
 
 def relink_links(content, base_url):
@@ -108,18 +114,14 @@ def archive_to_dir(directory, url):
     base_file = request(url).read()
     archived_content = bs4(base_file, 'html5lib')
 
-    # Relink CSS
-    relink_css(archived_content, url)
-
-    # Relink scripts
-    relink_scripts(archived_content, url)
-
-    # Relink images
-    relink_images(archived_content, url)
-
     # Relink ... links
    relink_links(archived_content, url)
 
+    # Archive objects
+    for (selector, directory, attribute) in OBJECT_TYPE_DESCRIPTORS:
+        archive(archived_content, url,
+                selector, directory, attribute)
+
     with open('index.html', 'wt') as f:
         f.write(str(archived_content))
 
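
Note on the extension point this patch introduces: the three copy-pasted
relink_* functions collapse into a single archive() driven by
OBJECT_TYPE_DESCRIPTORS, where each entry is a (find_all selector dict,
target directory, URL attribute) triple. The selector dict is expanded as
keyword arguments to BeautifulSoup's find_all(), so anything find_all()
accepts (tag name, attribute filters such as rel=...) can go in a
descriptor. A minimal sketch of what supporting one more asset type would
look like; the <source> tag choice and the 'media' directory name are
illustrative assumptions, not part of this patch:

    # Hypothetical extra descriptor: archive <source src=...> media
    # elements into a 'media' directory, keyed on their 'src' attribute.
    OBJECT_TYPE_DESCRIPTORS += (
        (
            {
                'name': 'source'
            },
            'media',
            'src',
        ),
    )

With that entry appended, the existing loop in archive_to_dir() picks the
new asset type up with no further code changes.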