Unify archiving method.
parent 4b3dd00086
commit 3e15c50e22
macli.py: 104 lines changed
@@ -11,6 +11,40 @@ from bs4 import BeautifulSoup as bs4
 USER_AGENT = 'miniarchiver bot'
 
 
+OBJECT_TYPE_DESCRIPTORS = (
+    (
+        {
+            'name': 'link',
+            'rel': 'stylesheet'
+        },
+        'styles',
+        'href',
+    ),
+    (
+        {
+            'name': 'img'
+        },
+        'images',
+        'src',
+    ),
+    (
+        {
+            'name': 'script'
+        },
+        'scripts',
+        'src',
+    ),
+    (
+        {
+            'name': 'link',
+            'rel': 'icon'
+        },
+        'icons',
+        'href',
+    ),
+)
+
+
 def get_parser():
     parser = argparse.ArgumentParser()
     parser.add_argument('url')
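Each descriptor above is a (selector, directory, attribute) triple: the selector dict is unpacked straight into BeautifulSoup's find_all(), the directory names the folder downloads land in, and the attribute is the tag attribute that gets rewritten to the local path. A minimal sketch of how the first descriptor resolves; the sample markup is made up for illustration:

    from bs4 import BeautifulSoup as bs4

    html = '<link rel="stylesheet" href="a.css"><img src="b.png">'
    soup = bs4(html, 'html5lib')

    # First descriptor from the table above, inlined here.
    selector, directory, attribute = ({'name': 'link', 'rel': 'stylesheet'},
                                      'styles', 'href')

    # Equivalent to soup.find_all(name='link', rel='stylesheet')
    for tag in soup.find_all(**selector):
        print(directory, tag[attribute])   # -> styles a.css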
@@ -34,55 +68,27 @@ def request(url):
     )
     return urllib.request.urlopen(req)
 
-def relink_css(content, base_url, directory='styles'):
-    os.makedirs(directory, exist_ok=True)
-    for link in content.find_all('link', rel='stylesheet'):
-        href = urllib.parse.urljoin(base_url, link['href'], allow_fragments=False)
-
-        name = hashlib.sha1(href.encode()).hexdigest() + '.css'
+
+def archive(content, base_url, selector, directory, attribute):
+    os.makedirs(directory, exist_ok=True)
+    for part in content.find_all(**selector):
+        if attribute not in part.attrs:
+            continue
+
+        href = urllib.parse.urljoin(base_url, part[attribute],
+                                    allow_fragments=False)
+
+        name = (hashlib.sha1(href.encode()).hexdigest()
+                + '.'
+                + get_extension(href))
+
         path = os.path.join(directory, name)
         if not os.path.exists(path):
             content = request(href).read()
             with open(path, 'wb') as f:
                 f.write(content)
 
-        link['href'] = path
-
-
-def relink_images(content, base_url, directory='images'):
-    os.makedirs(directory, exist_ok=True)
-    for image in content.find_all('img'):
-        if 'src' not in image.attrs:
-            continue
-
-        src = urllib.parse.urljoin(base_url, image['src'], allow_fragments=False)
-
-        name = hashlib.sha1(src.encode()).hexdigest() + '.' + get_extension(src)
-        path = os.path.join(directory, name)
-        if not os.path.exists(path):
-            content = request(src).read()
-            with open(path, 'wb') as f:
-                f.write(content)
-
-        image['src'] = path
-
-
-def relink_scripts(content, base_url, directory='scripts'):
-    os.makedirs(directory, exist_ok=True)
-    for script in content.find_all('script'):
-        if 'src' not in script.attrs:
-            continue
-
-        href = urllib.parse.urljoin(base_url, script['src'], allow_fragments=False)
-
-        name = hashlib.sha1(href.encode()).hexdigest() + '.' + get_extension(href)
-        path = os.path.join(directory, name)
-        if not os.path.exists(path):
-            content = request(href).read()
-            with open(path, 'wb') as f:
-                f.write(content)
-
-        script['src'] = path
+        part[attribute] = path
 
 
 def relink_links(content, base_url):
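get_extension() is not part of this diff; it presumably pulls a file suffix out of the URL so the sha1-named local copy keeps a usable extension. A rough stand-in, assuming it just takes the suffix of the URL's path component (a guess at the helper's behavior, not the project's actual code):

    import os
    import urllib.parse

    def get_extension(url):
        # Assumed behavior: 'https://example.com/a/b.css?v=2' -> 'css'
        path = urllib.parse.urlsplit(url).path
        return os.path.splitext(path)[1].lstrip('.')

One detail worth noting in archive(): the loop rebinds content (the soup passed in) to the downloaded bytes. That reads oddly but stays correct, because find_all() materializes its result list before the first rebinding, and naming files by the sha1 of the absolute URL means a resource referenced several times is downloaded only once.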
@@ -108,18 +114,14 @@ def archive_to_dir(directory, url):
     base_file = request(url).read()
     archived_content = bs4(base_file, 'html5lib')
 
-    # Relink CSS
-    relink_css(archived_content, url)
-
-    # Relink scripts
-    relink_scripts(archived_content, url)
-
-    # Relink images
-    relink_images(archived_content, url)
-
     # Relink ... links
     relink_links(archived_content, url)
 
+    # Archive objects
+    for (selector, directory, attribute) in OBJECT_TYPE_DESCRIPTORS:
+        archive(archived_content, url,
+                selector, directory, attribute)
+
     with open('index.html', 'wt') as f:
         f.write(str(archived_content))
 
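The payoff of the table-driven rewrite shows up here: supporting a new object type is now a data change rather than another relink_* function. For example, archiving video sources would be one more descriptor (hypothetical, not part of this commit):

    # Hypothetical additional entry, appended to the module's table:
    OBJECT_TYPE_DESCRIPTORS += (
        (
            {'name': 'video'},
            'videos',
            'src',
        ),
    )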