# miniarchiver/macli.py
#!/usr/bin/env python3
import hashlib
import os
import argparse
import urllib
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup as bs4
# Identifies this client in outgoing HTTP requests.
USER_AGENT = 'miniarchiver bot'

# One entry per kind of embedded resource to archive:
#   (BeautifulSoup find_all() keyword arguments,
#    subdirectory the downloads are stored in,
#    tag attribute that holds the resource URL)
OBJECT_TYPE_DESCRIPTORS = (
    ({'name': 'link', 'rel': 'stylesheet'}, 'styles', 'href'),
    ({'name': 'img'}, 'images', 'src'),
    ({'name': 'script'}, 'scripts', 'src'),
    ({'name': 'link', 'rel': 'icon'}, 'icons', 'href'),
)
def get_parser():
    """Return the command-line parser: one positional URL argument."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('url')
    return arg_parser
def get_extension(path):
    """Return the file extension of *path* (a filesystem path or URL).

    Any '?query' or '#fragment' part of a URL is stripped first, so
    'http://x/a.css?v=2' yields 'css' rather than 'css?v=2' (the raw
    split used previously let the query string leak into local file
    names).  If the last path component contains no dot, that whole
    component is returned, matching the original fallback behaviour.
    """
    # Drop query string and fragment before looking for the extension.
    path = urllib.parse.urlsplit(path).path
    filename = path.split('/')[-1].split('\\')[-1]
    return filename.split('.')[-1]
def request(url):
    """Open *url* with urllib and return the response object.

    Sends USER_AGENT so the archiver identifies itself to servers.
    """
    headers = {'User-Agent': USER_AGENT}
    http_request = urllib.request.Request(url, data=None, headers=headers)
    return urllib.request.urlopen(http_request)
def archive(content, base_url, selector, directory, attribute):
    """Download the resources matched by *selector* and rewrite the
    document to reference the local copies.

    content   -- BeautifulSoup document, modified in place
    base_url  -- URL the page came from, for resolving relative links
    selector  -- keyword arguments passed to content.find_all()
    directory -- subdirectory (created if missing) for the downloads
    attribute -- tag attribute holding the resource URL ('href'/'src')
    """
    os.makedirs(directory, exist_ok=True)
    for part in content.find_all(**selector):
        if attribute not in part.attrs:
            continue
        href = urllib.parse.urljoin(base_url, part[attribute],
                                    allow_fragments=False)
        # SHA-1 of the resolved URL gives a unique, filesystem-safe
        # local name; keep the extension so the type stays visible.
        name = (hashlib.sha1(href.encode()).hexdigest()
                + '.'
                + get_extension(href))
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            # Bug fix: the original rebound `content` here, shadowing
            # the soup-document parameter with the response bytes.
            data = request(href).read()
            with open(path, 'wb') as f:
                f.write(data)
        part[attribute] = path
def relink_links(content, base_url):
    """Rewrite relative references in the document to absolute URLs.

    Every anchor (<a href>) is absolutised.  <link> elements are
    absolutised too, except stylesheets, which are skipped here because
    archive() later replaces their hrefs with local copies.
    """
    for anchor in content.find_all('a'):
        if 'href' in anchor.attrs:
            anchor['href'] = urllib.parse.urljoin(base_url, anchor['href'])
    for link in content.find_all('link'):
        if 'href' not in link.attrs:
            continue
        if 'stylesheet' in link.attrs.get('rel', ''):
            continue
        link['href'] = urllib.parse.urljoin(base_url, link['href'])
def archive_to_dir(directory, url):
    """Archive *url* into *directory*.

    The page is written as index.html inside *directory*; stylesheets,
    images, scripts and icons go into subdirectories and the page is
    rewritten to reference the local copies.

    Fixes over the original: the working directory is restored in a
    ``finally`` block (a failed download previously left the process
    chdir'd into *directory*), and the loop variable no longer shadows
    the *directory* parameter.
    """
    previous_dir = os.getcwd()
    os.chdir(directory)
    try:
        # Download and parse the page.
        base_file = request(url).read()
        archived_content = bs4(base_file, 'html5lib')
        # Make anchor/link references absolute.
        relink_links(archived_content, url)
        # Fetch each class of embedded object and rewrite its tags.
        for (selector, subdir, attribute) in OBJECT_TYPE_DESCRIPTORS:
            archive(archived_content, url,
                    selector, subdir, attribute)
        with open('index.html', 'wt') as f:
            f.write(str(archived_content))
    finally:
        os.chdir(previous_dir)
def main():
    """Entry point: archive the URL given on the command line."""
    args = get_parser().parse_args()
    target = 'test_ma'
    os.makedirs(target, exist_ok=True)
    archive_to_dir(target, args.url)


if __name__ == '__main__':
    main()