Unify archiving method.
This commit is contained in:
parent
4b3dd00086
commit
3e15c50e22
104
macli.py
104
macli.py
@ -11,6 +11,40 @@ from bs4 import BeautifulSoup as bs4
|
|||||||
USER_AGENT = 'miniarchiver bot'
|
USER_AGENT = 'miniarchiver bot'
|
||||||
|
|
||||||
|
|
||||||
|
# One entry per kind of external object the archiver mirrors locally:
# (find_all selector kwargs, target directory, tag attribute holding the URL).
OBJECT_TYPE_DESCRIPTORS = (
    ({'name': 'link', 'rel': 'stylesheet'}, 'styles', 'href'),
    ({'name': 'img'}, 'images', 'src'),
    ({'name': 'script'}, 'scripts', 'src'),
    ({'name': 'link', 'rel': 'icon'}, 'icons', 'href'),
)
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('url')
|
parser.add_argument('url')
|
||||||
@ -34,55 +68,27 @@ def request(url):
|
|||||||
)
|
)
|
||||||
return urllib.request.urlopen(req)
|
return urllib.request.urlopen(req)
|
||||||
|
|
||||||
def relink_css(content, base_url, directory='styles'):
|
|
||||||
os.makedirs(directory, exist_ok=True)
|
|
||||||
for link in content.find_all('link', rel='stylesheet'):
|
|
||||||
href = urllib.parse.urljoin(base_url, link['href'], allow_fragments=False)
|
|
||||||
|
|
||||||
name = hashlib.sha1(href.encode()).hexdigest() + '.css'
|
def archive(content, base_url, selector, directory, attribute):
    """Download every object in *content* matched by *selector* and relink it.

    content   -- BeautifulSoup document; its matching tags are modified
                 in place to point at the local copies.
    base_url  -- URL the document was fetched from; relative object URLs
                 are resolved against it.
    selector  -- keyword arguments for ``content.find_all`` (e.g.
                 ``{'name': 'link', 'rel': 'stylesheet'}``).
    directory -- local directory to save objects into (created if missing).
    attribute -- tag attribute carrying the object URL ('href' or 'src').
    """
    os.makedirs(directory, exist_ok=True)
    for part in content.find_all(**selector):
        # Tags without the URL attribute (e.g. inline <script>) are skipped.
        if attribute not in part.attrs:
            continue

        href = urllib.parse.urljoin(base_url, part[attribute],
                                    allow_fragments=False)

        # Content-addressed file name: stable across runs, so each object
        # is only downloaded once.
        name = (hashlib.sha1(href.encode()).hexdigest()
                + '.'
                + get_extension(href))

        path = os.path.join(directory, name)
        if not os.path.exists(path):
            # Fix: bind the downloaded bytes to a dedicated local instead of
            # rebinding the ``content`` parameter (which shadowed the
            # document being archived).
            data = request(href).read()
            with open(path, 'wb') as f:
                f.write(data)

        part[attribute] = path
|
||||||
|
|
||||||
|
|
||||||
def relink_images(content, base_url, directory='images'):
    """Download all <img> targets into *directory* and point their ``src``
    attributes at the local copies.  *content* (a BeautifulSoup document)
    is modified in place.
    """
    os.makedirs(directory, exist_ok=True)
    for image in content.find_all('img'):
        # An <img> without a src has nothing to archive.
        if 'src' not in image.attrs:
            continue

        src = urllib.parse.urljoin(base_url, image['src'], allow_fragments=False)

        # Hash-based name deduplicates images repeated across the page.
        name = hashlib.sha1(src.encode()).hexdigest() + '.' + get_extension(src)
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            # Fix: store the downloaded bytes in a local instead of
            # rebinding the ``content`` parameter (the document).
            data = request(src).read()
            with open(path, 'wb') as f:
                f.write(data)

        image['src'] = path
|
|
||||||
|
|
||||||
|
|
||||||
def relink_scripts(content, base_url, directory='scripts'):
    """Download all external <script> sources into *directory* and point
    their ``src`` attributes at the local copies.  *content* (a
    BeautifulSoup document) is modified in place.
    """
    os.makedirs(directory, exist_ok=True)
    for script in content.find_all('script'):
        # Inline <script> tags have no src and are left untouched.
        if 'src' not in script.attrs:
            continue

        href = urllib.parse.urljoin(base_url, script['src'], allow_fragments=False)

        # Hash-based name deduplicates scripts shared between pages/runs.
        name = hashlib.sha1(href.encode()).hexdigest() + '.' + get_extension(href)
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            # Fix: store the downloaded bytes in a local instead of
            # rebinding the ``content`` parameter (the document).
            data = request(href).read()
            with open(path, 'wb') as f:
                f.write(data)

        script['src'] = path
|
|
||||||
|
|
||||||
|
|
||||||
def relink_links(content, base_url):
|
def relink_links(content, base_url):
|
||||||
@ -108,18 +114,14 @@ def archive_to_dir(directory, url):
|
|||||||
base_file = request(url).read()
|
base_file = request(url).read()
|
||||||
archived_content = bs4(base_file, 'html5lib')
|
archived_content = bs4(base_file, 'html5lib')
|
||||||
|
|
||||||
# Relink CSS
|
|
||||||
relink_css(archived_content, url)
|
|
||||||
|
|
||||||
# Relink scripts
|
|
||||||
relink_scripts(archived_content, url)
|
|
||||||
|
|
||||||
# Relink images
|
|
||||||
relink_images(archived_content, url)
|
|
||||||
|
|
||||||
# Relink ... links
|
# Relink ... links
|
||||||
relink_links(archived_content, url)
|
relink_links(archived_content, url)
|
||||||
|
|
||||||
|
# Archive objects
|
||||||
|
for (selector, directory, attribute) in OBJECT_TYPE_DESCRIPTORS:
|
||||||
|
archive(archived_content, url,
|
||||||
|
selector, directory, attribute)
|
||||||
|
|
||||||
with open('index.html', 'wt') as f:
|
with open('index.html', 'wt') as f:
|
||||||
f.write(str(archived_content))
|
f.write(str(archived_content))
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user