# miniarchiver/macli.py
#!/usr/bin/env python3
import hashlib
import os
import argparse
import urllib
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup as bs4
# Identifies this client in outgoing HTTP requests.
USER_AGENT = 'miniarchiver bot'

# One entry per kind of embedded resource to archive:
#   (BeautifulSoup find_all() keyword arguments,
#    subdirectory the downloads are stored in,
#    tag attribute that holds the resource URL)
OBJECT_TYPE_DESCRIPTORS = (
    ({'name': 'link', 'rel': 'stylesheet'}, 'styles', 'href'),
    ({'name': 'img'}, 'images', 'src'),
    ({'name': 'script'}, 'scripts', 'src'),
    ({'name': 'link', 'rel': 'icon'}, 'icons', 'href'),
)
def get_parser():
    """Return the command-line parser: one positional URL argument."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('url')
    return arg_parser
def get_extension(path):
    """Return the file extension of *path* (a filesystem path or URL).

    Any '?query' or '#fragment' part of a URL is stripped first, so
    'http://x/a.css?v=2' yields 'css' rather than 'css?v=2' (the raw
    split used previously let the query string leak into local file
    names).  If the last path component contains no dot, that whole
    component is returned, matching the original fallback behaviour.
    """
    # Drop query string and fragment before looking for the extension.
    path = urllib.parse.urlsplit(path).path
    filename = path.split('/')[-1].split('\\')[-1]
    return filename.split('.')[-1]
def request(url):
    """Open *url* with urllib and return the response object.

    Sends USER_AGENT so the archiver identifies itself to servers.
    """
    headers = {'User-Agent': USER_AGENT}
    http_request = urllib.request.Request(url, data=None, headers=headers)
    return urllib.request.urlopen(http_request)
def archive(content, base_url, selector, directory, attribute):
    """Download the resources matched by *selector* and rewrite the
    document to reference the local copies.

    content   -- BeautifulSoup document, modified in place
    base_url  -- URL the page came from, for resolving relative links
    selector  -- keyword arguments passed to content.find_all()
    directory -- subdirectory (created if missing) for the downloads
    attribute -- tag attribute holding the resource URL ('href'/'src')
    """
    os.makedirs(directory, exist_ok=True)
    for part in content.find_all(**selector):
        if attribute not in part.attrs:
            continue
        href = urllib.parse.urljoin(base_url, part[attribute],
                                    allow_fragments=False)
        # SHA-1 of the resolved URL gives a unique, filesystem-safe
        # local name; keep the extension so the type stays visible.
        name = (hashlib.sha1(href.encode()).hexdigest()
                + '.'
                + get_extension(href))
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            # Bug fix: the original rebound `content` here, shadowing
            # the soup-document parameter with the response bytes.
            data = request(href).read()
            with open(path, 'wb') as f:
                f.write(data)
        part[attribute] = path
def relink_links(content, base_url):
    """Rewrite relative references in the document to absolute URLs.

    Every anchor (<a href>) is absolutised.  <link> elements are
    absolutised too, except stylesheets, which are skipped here because
    archive() later replaces their hrefs with local copies.
    """
    for anchor in content.find_all('a'):
        if 'href' in anchor.attrs:
            anchor['href'] = urllib.parse.urljoin(base_url, anchor['href'])
    for link in content.find_all('link'):
        if 'href' not in link.attrs:
            continue
        if 'stylesheet' in link.attrs.get('rel', ''):
            continue
        link['href'] = urllib.parse.urljoin(base_url, link['href'])
def archive_to_dir(directory, url):
    """Archive *url* into *directory*.

    The page is written as index.html inside *directory*; stylesheets,
    images, scripts and icons go into subdirectories and the page is
    rewritten to reference the local copies.

    Fixes over the original: the working directory is restored in a
    ``finally`` block (a failed download previously left the process
    chdir'd into *directory*), and the loop variable no longer shadows
    the *directory* parameter.
    """
    previous_dir = os.getcwd()
    os.chdir(directory)
    try:
        # Download and parse the page.
        base_file = request(url).read()
        archived_content = bs4(base_file, 'html5lib')
        # Make anchor/link references absolute.
        relink_links(archived_content, url)
        # Fetch each class of embedded object and rewrite its tags.
        for (selector, subdir, attribute) in OBJECT_TYPE_DESCRIPTORS:
            archive(archived_content, url,
                    selector, subdir, attribute)
        with open('index.html', 'wt') as f:
            f.write(str(archived_content))
    finally:
        os.chdir(previous_dir)
def main():
    """Entry point: archive the URL given on the command line."""
    args = get_parser().parse_args()
    target = 'test_ma'
    os.makedirs(target, exist_ok=True)
    archive_to_dir(target, args.url)


if __name__ == '__main__':
    main()