miniarchiver/macli.py

#!/usr/bin/env python3

import hashlib
import os
import argparse
import urllib
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup as bs4

# Value sent as the User-Agent header by request().
USER_AGENT = 'miniarchiver bot'


def get_parser():
    """Return an argument parser with one positional argument, ``url``."""
    cli = argparse.ArgumentParser()
    cli.add_argument('url')
    return cli


def get_extension(path):
    """Return the file extension (without the dot) of a URL or file path.

    Args:
        path: A URL or a filesystem path; both ``/`` and ``\\`` are treated
            as directory separators.

    Returns:
        The text after the last ``.`` of the final path component, or an
        empty string when that component contains no dot.
    """
    # Drop the query string and fragment first: they are not part of the
    # file name and may themselves contain dots or slashes.
    for separator in ('?', '#'):
        path = path.split(separator, 1)[0]

    basename = path.split('/')[-1].split('\\')[-1]

    # rpartition reports whether a dot was found, so a dot-less name yields
    # '' instead of the whole name being mistaken for an extension.
    _, dot, extension = basename.rpartition('.')
    return extension if dot else ''


def request(url):
    """Open ``url`` with the archiver's User-Agent and return the response.

    No request body is attached (``data=None``). The caller is responsible
    for reading and closing the returned response object.
    """
    custom_headers = {'User-Agent': USER_AGENT}
    prepared = urllib.request.Request(url, data=None, headers=custom_headers)
    return urllib.request.urlopen(prepared)

def relink_css(content, base_url, directory='styles'):
    """Download every linked stylesheet and point its ``<link>`` at the copy.

    Args:
        content: Parsed document (BeautifulSoup tree); modified in place.
        base_url: URL of the page, used to resolve relative ``href`` values.
        directory: Directory, relative to the current working directory,
            where stylesheets are stored. Created if it does not exist.

    Each stylesheet is saved as ``<sha1 of its absolute URL>.css``. A file
    that already exists is reused instead of being downloaded again.
    """
    os.makedirs(directory, exist_ok=True)
    for link in content.find_all('link', rel='stylesheet'):
        # A stylesheet <link> without href has nothing to download.
        if 'href' not in link.attrs:
            continue

        href = urllib.parse.urljoin(base_url, link['href'], allow_fragments=False)

        name = hashlib.sha1(href.encode()).hexdigest() + '.css'
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            # Use a distinct name so the ``content`` parameter is not
            # shadowed, and close the response as soon as it has been read.
            with request(href) as response:
                stylesheet = response.read()
            with open(path, 'wb') as f:
                f.write(stylesheet)

        link['href'] = path


def relink_images(content, base_url, directory='images'):
    """Download every ``<img>`` source and point the tag at the local copy.

    Args:
        content: Parsed document (BeautifulSoup tree); modified in place.
        base_url: URL of the page, used to resolve relative ``src`` values.
        directory: Directory, relative to the current working directory,
            where images are stored. Created if it does not exist.

    Each image is saved as ``<sha1 of its absolute URL>.<extension>``. A
    file that already exists is reused instead of being downloaded again.
    Images without ``src`` and inline ``data:`` images are left untouched.
    """
    os.makedirs(directory, exist_ok=True)
    for image in content.find_all('img'):
        if 'src' not in image.attrs:
            continue

        # Inline data: URIs are already embedded in the page; fetching them
        # would also derive a file name from their encoded payload.
        if image['src'].strip().lower().startswith('data:'):
            continue

        src = urllib.parse.urljoin(base_url, image['src'], allow_fragments=False)

        name = hashlib.sha1(src.encode()).hexdigest() + '.' + get_extension(src)
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            # Use a distinct name so the ``content`` parameter is not
            # shadowed, and close the response as soon as it has been read.
            with request(src) as response:
                image_data = response.read()
            with open(path, 'wb') as f:
                f.write(image_data)

        image['src'] = path


def relink_scripts(content, base_url, directory='scripts'):
    """Download every external script and point its tag at the local copy.

    Args:
        content: Parsed document (BeautifulSoup tree); modified in place.
        base_url: URL of the page, used to resolve relative ``src`` values.
        directory: Directory, relative to the current working directory,
            where scripts are stored. Created if it does not exist.

    Each script is saved as ``<sha1 of its absolute URL>.<extension>``. A
    file that already exists is reused instead of being downloaded again.
    Inline scripts (no ``src`` attribute) are left untouched.
    """
    os.makedirs(directory, exist_ok=True)
    for script in content.find_all('script'):
        if 'src' not in script.attrs:
            continue

        href = urllib.parse.urljoin(base_url, script['src'], allow_fragments=False)

        name = hashlib.sha1(href.encode()).hexdigest() + '.' + get_extension(href)
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            # Use a distinct name so the ``content`` parameter is not
            # shadowed, and close the response as soon as it has been read.
            with request(href) as response:
                script_source = response.read()
            with open(path, 'wb') as f:
                f.write(script_source)

        script['src'] = path


def relink_links(content, base_url):
    """Rewrite anchor and non-stylesheet ``<link>`` hrefs as absolute URLs.

    Elements without an ``href`` are skipped, as are ``<link>`` elements
    whose ``rel`` contains ``stylesheet``. ``content`` is modified in place;
    relative hrefs are resolved against ``base_url``.
    """
    absolutize = urllib.parse.urljoin

    for anchor in content.find_all('a'):
        if 'href' in anchor.attrs:
            anchor['href'] = absolutize(base_url, anchor['href'])

    for element in content.find_all('link'):
        attrs = element.attrs
        if 'href' in attrs and 'stylesheet' not in attrs.get('rel', ''):
            element['href'] = absolutize(base_url, element['href'])


def archive_to_dir(directory, url):
    """Archive the page at ``url``, together with its assets, in ``directory``.

    Args:
        directory: Existing directory that receives ``index.html`` and the
            asset sub-directories created by the ``relink_*`` helpers.
        url: Address of the page to archive; also used as the base URL for
            resolving relative references.

    The process working directory is switched to ``directory`` while the
    archive is built, because the helpers write to relative paths. It is
    always restored afterwards, even when a download or write fails.
    """
    current_path = os.getcwd()
    os.chdir(directory)
    try:
        # Download file; close the response once its body has been read.
        with request(url) as response:
            base_file = response.read()
        archived_content = bs4(base_file, 'html5lib')

        # Relink CSS
        relink_css(archived_content, url)

        # Relink scripts
        relink_scripts(archived_content, url)

        # Relink images
        relink_images(archived_content, url)

        # Relink ... links
        relink_links(archived_content, url)

        # Explicit UTF-8 so that writing does not depend on (or fail under)
        # the platform's default text encoding.
        with open('index.html', 'wt', encoding='utf-8') as f:
            f.write(str(archived_content))
    finally:
        # Never leave the caller stranded in the archive directory.
        os.chdir(current_path)


def main():
    """Parse the command line and archive the given URL into ``test_ma``."""
    target_dir = 'test_ma'
    url = get_parser().parse_args().url
    os.makedirs(target_dir, exist_ok=True)
    archive_to_dir(target_dir, url)


# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()
Add initial concept. 2017-07-05 00:05:01 +02:00			`#!/usr/bin/env python3`

			`import hashlib`
			`import os`
			`import argparse`
			`import urllib`
			`import urllib.request`
			`import urllib.parse`
			`from bs4 import BeautifulSoup as bs4`

Use a custom user-agent. 2017-07-05 00:14:47 +02:00			`USER_AGENT = 'miniarchiver bot'`

Add initial concept. 2017-07-05 00:05:01 +02:00
			`def get_parser():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument('url')`
			`return parser`


			`def get_extension(path):`
			`return (path`
			`.split('/')[-1]`
			`.split('\\')[-1]`
			`.split('.')[-1])`


Use a custom user-agent. 2017-07-05 00:14:47 +02:00			`def request(url):`
			`req = urllib.request.Request(`
			`url,`
			`data=None,`
			`headers={`
			`'User-Agent': USER_AGENT,`
			`}`
			`)`
			`return urllib.request.urlopen(req)`

Add initial concept. 2017-07-05 00:05:01 +02:00			`def relink_css(content, base_url, directory='styles'):`
			`os.makedirs(directory, exist_ok=True)`
			`for link in content.find_all('link', rel='stylesheet'):`
			`href = urllib.parse.urljoin(base_url, link['href'], allow_fragments=False)`

			`name = hashlib.sha1(href.encode()).hexdigest() + '.css'`
			`path = os.path.join(directory, name)`
			`if not os.path.exists(path):`
Use a custom user-agent. 2017-07-05 00:14:47 +02:00			`content = request(href).read()`
Add initial concept. 2017-07-05 00:05:01 +02:00			`with open(path, 'wb') as f:`
			`f.write(content)`

			`link['href'] = path`


			`def relink_images(content, base_url, directory='images'):`
			`os.makedirs(directory, exist_ok=True)`
			`for image in content.find_all('img'):`
Do more checks before jumping to relinking. 2017-07-05 00:15:43 +02:00			`if 'src' not in image.attrs:`
			`continue`

Add initial concept. 2017-07-05 00:05:01 +02:00			`src = urllib.parse.urljoin(base_url, image['src'], allow_fragments=False)`

			`name = hashlib.sha1(src.encode()).hexdigest() + '.' + get_extension(src)`
			`path = os.path.join(directory, name)`
			`if not os.path.exists(path):`
Use a custom user-agent. 2017-07-05 00:14:47 +02:00			`content = request(src).read()`
Add initial concept. 2017-07-05 00:05:01 +02:00			`with open(path, 'wb') as f:`
			`f.write(content)`

			`image['src'] = path`


			`def relink_scripts(content, base_url, directory='scripts'):`
			`os.makedirs(directory, exist_ok=True)`
			`for script in content.find_all('script'):`
Do more checks before jumping to relinking. 2017-07-05 00:15:43 +02:00			`if 'src' not in script.attrs:`
			`continue`

Add initial concept. 2017-07-05 00:05:01 +02:00			`href = urllib.parse.urljoin(base_url, script['src'], allow_fragments=False)`

			`name = hashlib.sha1(href.encode()).hexdigest() + '.' + get_extension(href)`
			`path = os.path.join(directory, name)`
			`if not os.path.exists(path):`
Use a custom user-agent. 2017-07-05 00:14:47 +02:00			`content = request(href).read()`
Add initial concept. 2017-07-05 00:05:01 +02:00			`with open(path, 'wb') as f:`
			`f.write(content)`

			`script['src'] = path`


			`def relink_links(content, base_url):`
			`for link in content.find_all('a'):`
			`if 'href' not in link.attrs:`
			`continue`

			`full_href = urllib.parse.urljoin(base_url, link['href'])`
			`link['href'] = full_href`

			`for link in content.find_all('link'):`
			`if 'href' not in link.attrs or 'stylesheet' in link.attrs.get('rel', ''):`
			`continue`

			`full_href = urllib.parse.urljoin(base_url, link['href'])`
			`link['href'] = full_href`


			`def archive_to_dir(directory, url):`
			`current_path = os.getcwd()`
			`os.chdir(directory)`
			`# Download file`
Use a custom user-agent. 2017-07-05 00:14:47 +02:00			`base_file = request(url).read()`
Add initial concept. 2017-07-05 00:05:01 +02:00			`archived_content = bs4(base_file, 'html5lib')`

			`# Relink CSS`
			`relink_css(archived_content, url)`

			`# Relink scripts`
			`relink_scripts(archived_content, url)`

			`# Relink images`
			`relink_images(archived_content, url)`

			`# Relink ... links`
			`relink_links(archived_content, url)`

			`with open('index.html', 'wt') as f:`
			`f.write(str(archived_content))`

			`os.chdir(current_path)`


			`def main():`
			`args = get_parser().parse_args()`
			`os.makedirs('test_ma', exist_ok=True)`
			`archive_to_dir('test_ma', args.url)`


			`if __name__ == '__main__':`
			`main()`