Unify archiving method.

Author: kenkeiras
Date:   2017-07-05 00:39:01 +02:00
Parent: 4b3dd00086
Commit: 3e15c50e22

macli.py (104 changed lines)

@@ -11,6 +11,40 @@ from bs4 import BeautifulSoup as bs4
 USER_AGENT = 'miniarchiver bot'
 
+OBJECT_TYPE_DESCRIPTORS = (
+    (
+        {
+            'name': 'link',
+            'rel': 'stylesheet'
+        },
+        'styles',
+        'href',
+    ),
+    (
+        {
+            'name': 'img'
+        },
+        'images',
+        'src',
+    ),
+    (
+        {
+            'name': 'script'
+        },
+        'scripts',
+        'src',
+    ),
+    (
+        {
+            'name': 'link',
+            'rel': 'icon'
+        },
+        'icons',
+        'href',
+    ),
+)
+
 
 def get_parser():
     parser = argparse.ArgumentParser()
     parser.add_argument('url')
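Each entry in OBJECT_TYPE_DESCRIPTORS is a (selector, directory, attribute) triple: the selector dict is splatted into BeautifulSoup's find_all() as keyword filters, directory is where the fetched copies land, and attribute is the tag attribute rewritten to the local path. Three of the four entries reproduce the removed relink_css/relink_images/relink_scripts helpers; the icons entry is new behavior. A minimal standalone sketch of that lookup, with illustrative HTML and the stdlib html.parser backend instead of the project's html5lib:

    from bs4 import BeautifulSoup

    # Illustrative input, just to show how one descriptor selects tags.
    html = '<link rel="stylesheet" href="a.css"><img src="b.png"><img alt="no src">'
    soup = BeautifulSoup(html, 'html.parser')

    selector, directory, attribute = {'name': 'img'}, 'images', 'src'
    for tag in soup.find_all(**selector):   # find_all(name='img')
        if attribute not in tag.attrs:      # same guard as archive() below
            continue
        print(directory, tag[attribute])    # -> images b.png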
@@ -34,55 +68,27 @@ def request(url):
     )
     return urllib.request.urlopen(req)
 
-def relink_css(content, base_url, directory='styles'):
-    os.makedirs(directory, exist_ok=True)
-    for link in content.find_all('link', rel='stylesheet'):
-        href = urllib.parse.urljoin(base_url, link['href'], allow_fragments=False)
-        name = hashlib.sha1(href.encode()).hexdigest() + '.css'
+
+def archive(content, base_url, selector, directory, attribute):
+    os.makedirs(directory, exist_ok=True)
+    for part in content.find_all(**selector):
+        if attribute not in part.attrs:
+            continue
+
+        href = urllib.parse.urljoin(base_url, part[attribute],
+                                    allow_fragments=False)
+        name = (hashlib.sha1(href.encode()).hexdigest()
+                + '.'
+                + get_extension(href))
         path = os.path.join(directory, name)
         if not os.path.exists(path):
             content = request(href).read()
             with open(path, 'wb') as f:
                 f.write(content)
-        link['href'] = path
-
-
-def relink_images(content, base_url, directory='images'):
-    os.makedirs(directory, exist_ok=True)
-    for image in content.find_all('img'):
-        if 'src' not in image.attrs:
-            continue
-        src = urllib.parse.urljoin(base_url, image['src'], allow_fragments=False)
-        name = hashlib.sha1(src.encode()).hexdigest() + '.' + get_extension(src)
-        path = os.path.join(directory, name)
-        if not os.path.exists(path):
-            content = request(src).read()
-            with open(path, 'wb') as f:
-                f.write(content)
-        image['src'] = path
-
-
-def relink_scripts(content, base_url, directory='scripts'):
-    os.makedirs(directory, exist_ok=True)
-    for script in content.find_all('script'):
-        if 'src' not in script.attrs:
-            continue
-        href = urllib.parse.urljoin(base_url, script['src'], allow_fragments=False)
-        name = hashlib.sha1(href.encode()).hexdigest() + '.' + get_extension(href)
-        path = os.path.join(directory, name)
-        if not os.path.exists(path):
-            content = request(href).read()
-            with open(path, 'wb') as f:
-                f.write(content)
-        script['src'] = path
+        part[attribute] = path
 
 
 def relink_links(content, base_url):
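The new archive() folds the three nearly identical relink_* bodies into one table-driven helper, so each removed function becomes a single call. The unification also brings two small behavior changes: relink_css hardcoded the '.css' extension, whereas archive() derives it with get_extension() as the other helpers already did, and the stylesheet path gains the missing-attribute guard it previously lacked. A sketch of the equivalences, assuming a parsed soup and a page_url in scope:

    # What relink_images(soup, page_url) used to do:
    archive(soup, page_url, {'name': 'img'}, 'images', 'src')

    # What relink_css(soup, page_url) used to do:
    archive(soup, page_url, {'name': 'link', 'rel': 'stylesheet'},
            'styles', 'href')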
@@ -108,18 +114,14 @@ def archive_to_dir(directory, url):
     base_file = request(url).read()
     archived_content = bs4(base_file, 'html5lib')
 
-    # Relink CSS
-    relink_css(archived_content, url)
-
-    # Relink scripts
-    relink_scripts(archived_content, url)
-
-    # Relink images
-    relink_images(archived_content, url)
-
     # Relink ... links
     relink_links(archived_content, url)
 
+    # Archive objects
+    for (selector, directory, attribute) in OBJECT_TYPE_DESCRIPTORS:
+        archive(archived_content, url,
+                selector, directory, attribute)
+
     with open('index.html', 'wt') as f:
         f.write(str(archived_content))
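With the dispatch loop in place, supporting a new asset type is a data change rather than a new function: append a descriptor and the same loop picks it up. A hedged sketch; the <video> entry is hypothetical, not part of this commit:

    # Hypothetical extra descriptor: were it appended to the table, the loop
    # in archive_to_dir() would also fetch and relink <video src="..."> tags.
    OBJECT_TYPE_DESCRIPTORS += (
        ({'name': 'video'}, 'videos', 'src'),
    )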