miniarchiver/macli.py

#!/usr/bin/env python3
import argparse
import hashlib
import os
import re
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup as bs4

import progress_meter  # local helper module that provides ProgressBar

USER_AGENT = 'miniarchiver bot'
ARCHIVE_ROOT = 'archive'
DEFAULT_NAME = 'archived_web'

# Archive names become directory names, so restrict them to a safe set.
ALLOWED_NAMES_RE = re.compile(r'^[- .,:@a-zA-Z0-9]+$')

# Each descriptor: (BeautifulSoup find_all() selector,
#                   target subdirectory, attribute holding the URL).
OBJECT_TYPE_DESCRIPTORS = (
    (
        {'name': 'link', 'rel': 'stylesheet'},
        'styles',
        'href',
    ),
    (
        {'name': 'img'},
        'images',
        'src',
    ),
    (
        {'name': 'script'},
        'scripts',
        'src',
    ),
    (
        {'name': 'link', 'rel': 'icon'},
        'icons',
        'href',
    ),
)
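# For instance, the first descriptor makes archive() call
# content.find_all(name='link', rel='stylesheet') and rewrite each match's
# 'href' to point at the downloaded copy under the 'styles/' subdirectory.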

def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('url')
    parser.add_argument('--name', '-n', default=DEFAULT_NAME)
    parser.add_argument('--force', '-f', action='store_true')
    return parser

def get_filename(path):
    # Strip any query string, then take the last path component,
    # handling both '/' and '\' separators.
    return (path
            .split('?')[0]
            .split('/')[-1]
            .split('\\')[-1])


def get_extension(path):
    return get_filename(path).split('.')[-1]

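# Illustrative behavior of the helpers above (the URL is a made-up example):
#   get_filename('https://example.com/a/style.css?v=2')  -> 'style.css'
#   get_extension('https://example.com/a/style.css?v=2') -> 'css'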

def request(url):
    req = urllib.request.Request(
        url,
        data=None,
        headers={'User-Agent': USER_AGENT},
    )
    return urllib.request.urlopen(req)

def show_error(e, href=None):
    # Clear the current progress line, then print the error white-on-red.
    print("\r\x1b[K\x1b[41;37m{}\x1b[0m".format(e))
    if href is not None:
        print("URL: {}".format(href))

def archive(content, base_url, selector, directory, attribute, progbar):
    for part in content.find_all(**selector):
        if attribute not in part.attrs:
            continue
        # Inline data: URIs need no downloading.
        if part[attribute].startswith('data:'):
            continue
        href = urllib.parse.urljoin(base_url, part[attribute],
                                    allow_fragments=False)
        progbar.next_iter(href)

        # Prefix the filename with the SHA-1 of the absolute URL so that
        # equally named files from different locations cannot collide.
        name = (hashlib.sha1(href.encode()).hexdigest()
                + '/'
                + get_filename(part[attribute]))
        path = os.path.join(directory, name)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        if not os.path.exists(path):
            try:
                data = request(href).read()
            except Exception as e:
                show_error(e, href)
                continue
            with open(path, 'wb') as f:
                f.write(data)
        # Point the element at the local copy.
        part[attribute] = path

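# Illustrative on-disk layout after a run (hashes shortened, names made up):
#   styles/6ff87c4…/main.css
#   images/0a4d55a…/logo.png
# where each hex directory is the SHA-1 of the object's absolute URL.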
def relink_links(content, base_url):
    # Make anchor targets absolute so they keep working from the local copy.
    for link in content.find_all('a'):
        if 'href' not in link.attrs:
            continue
        link['href'] = urllib.parse.urljoin(base_url, link['href'])
    # Do the same for <link> elements, except stylesheets, which are
    # downloaded and relinked by archive() instead.
    for link in content.find_all('link'):
        if 'href' not in link.attrs or 'stylesheet' in link.attrs.get('rel', ''):
            continue
        link['href'] = urllib.parse.urljoin(base_url, link['href'])

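# urljoin resolves relative references against the page URL, e.g.
# (illustrative): urllib.parse.urljoin('https://example.com/a/', '../img/x.png')
# returns 'https://example.com/img/x.png'.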
def get_num_elements(content):
    # Total number of objects the archiver will visit; used to size
    # the progress bar.
    count = 0
    for (selector, _, attribute) in OBJECT_TYPE_DESCRIPTORS:
        count += sum(1
                     for element in content.find_all(**selector)
                     if attribute in element.attrs)
    return count

def archive_to_dir(directory, url):
    current_path = os.getcwd()
    os.chdir(directory)
    # Download the page itself
    base_file = request(url).read()
    archived_content = bs4(base_file, 'html5lib')
    # Make remaining links absolute
    relink_links(archived_content, url)
    # Archive referenced objects (styles, images, scripts, icons)
    progbar = progress_meter.ProgressBar(get_num_elements(archived_content))
    for (selector, subdir, attribute) in OBJECT_TYPE_DESCRIPTORS:
        archive(archived_content, url,
                selector, subdir, attribute, progbar)

    with open('index.html', 'wt') as f:
        f.write(str(archived_content))
    os.chdir(current_path)

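# Design note: archive() builds paths relative to the archive directory,
# which is why archive_to_dir() chdir()s into it first and restores the
# previous working directory when done.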
def main():
    args = get_parser().parse_args()
    path = os.path.join(ARCHIVE_ROOT, args.name)
    if not ALLOWED_NAMES_RE.match(args.name):
        print("Only the characters 'a-zA-Z0-9', spaces, and '-.,:@'"
              " are allowed in names.")
        return
    if os.path.exists(path) and not args.force:
        print(("Archive “{}” already exists; pick a new name with '-n <name>'"
               " or force an overwrite with '-f'.")
              .format(args.name))
        return
    os.makedirs(path, exist_ok=True)
    archive_to_dir(path, args.url)


if __name__ == '__main__':
    main()
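# Example invocation (illustrative; the archive name is arbitrary):
#   ./macli.py https://example.com --name example-snapshot
# writes archive/example-snapshot/index.html and stores referenced styles,
# images, scripts and icons under hashed subdirectories next to it.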