#!/usr/bin/env python3

import argparse
import hashlib
import os
import re
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup as bs4

import progress_meter
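
# progress_meter is a local sibling module. Only the interface sketched
# below is relied on here; it is inferred from this file's usage, so treat
# it as an assumption about that module rather than its full API:
#
#     class ProgressBar:
#         def __init__(self, total): ...   # expected number of steps
#         def next_iter(self, label): ...  # advance one step, show label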

USER_AGENT = 'miniarchiver bot'
ARCHIVE_ROOT = 'archive'
DEFAULT_NAME = 'archived_web'
ALLOWED_NAMES_RE = re.compile(r'^[- .,:@a-zA-Z0-9]+$')

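# Each descriptor pairs a BeautifulSoup find_all() selector with the
# subdirectory that matches are stored in and the tag attribute that
# holds the asset's URL.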
OBJECT_TYPE_DESCRIPTORS = (
    (
        {
            'name': 'link',
            'rel': 'stylesheet'
        },
        'styles',
        'href',
    ),
    (
        {
            'name': 'img'
        },
        'images',
        'src',
    ),
    (
        {
            'name': 'script'
        },
        'scripts',
        'src',
    ),
    (
        {
            'name': 'link',
            'rel': 'icon'
        },
        'icons',
        'href',
    ),
)
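
# For example, the first descriptor makes archive() download the target of
# every <link rel="stylesheet" href=...> into styles/<sha1>.<ext> and point
# the tag's href at that local copy.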


def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('url')
    parser.add_argument('--name', '-n', default=DEFAULT_NAME)
    parser.add_argument('--force', '-f', action='store_true')
    return parser


def get_extension(path):
    # Last dot-separated chunk of the final path component. A query
    # string (e.g. '?v=2') would remain attached, since URLs are used
    # here unparsed.
    return (path
            .split('/')[-1]
            .split('\\')[-1]
            .split('.')[-1])


def request(url):
    # Identify the bot with a custom User-Agent instead of urllib's default.
    req = urllib.request.Request(
        url,
        data=None,
        headers={
            'User-Agent': USER_AGENT,
        }
    )
    return urllib.request.urlopen(req)


def show_error(e):
    # Print the error white-on-red using ANSI escape codes.
    print("\x1b[41;37m{}\x1b[0m".format(e))


def archive(content, base_url, selector, directory, attribute, progbar):
    os.makedirs(directory, exist_ok=True)

    for part in content.find_all(**selector):
        if attribute not in part.attrs:
            continue

        # Inline data: URIs carry their content already; nothing to fetch.
        if part[attribute].startswith('data:'):
            continue

        href = urllib.parse.urljoin(base_url, part[attribute],
                                    allow_fragments=False)
        progbar.next_iter(href)

        # Hash the full URL for a collision-safe local filename.
        name = (hashlib.sha1(href.encode()).hexdigest()
                + '.'
                + get_extension(href))

        path = os.path.join(directory, name)
        if not os.path.exists(path):
            try:
                data = request(href).read()
            except Exception as e:
                show_error(e)
                continue

            with open(path, 'wb') as f:
                f.write(data)

        # Point the tag at the local copy (relative to the archive root).
        part[attribute] = path


def relink_links(content, base_url):
    # Make anchors absolute so navigation still works from the local copy.
    for link in content.find_all('a'):
        if 'href' not in link.attrs:
            continue

        link['href'] = urllib.parse.urljoin(base_url, link['href'])

    # Likewise for <link> tags, except stylesheets, which archive()
    # replaces with local copies.
    for link in content.find_all('link'):
        if 'href' not in link.attrs or 'stylesheet' in link.attrs.get('rel', ''):
            continue

        link['href'] = urllib.parse.urljoin(base_url, link['href'])


def get_num_elements(content):
    # Count the assets the progress bar should expect. data: URIs are
    # counted here but skipped by archive(), so the bar may stop short
    # of its total.
    count = 0
    for (selector, _, attribute) in OBJECT_TYPE_DESCRIPTORS:
        count += sum(1
                     for element in content.find_all(**selector)
                     if attribute in element.attrs)

    return count


def archive_to_dir(directory, url):
    # Work inside the archive directory so all stored paths are relative.
    current_path = os.getcwd()
    os.chdir(directory)

    # Download and parse the page; the 'html5lib' parser is lenient and
    # browser-like, but the html5lib package must be installed.
    base_file = request(url).read()
    archived_content = bs4(base_file, 'html5lib')

    # Rewrite navigation links to absolute URLs.
    relink_links(archived_content, url)

    # Localize stylesheets, images, scripts, and icons.
    progbar = progress_meter.ProgressBar(get_num_elements(archived_content))

    for (selector, subdirectory, attribute) in OBJECT_TYPE_DESCRIPTORS:
        archive(archived_content, url,
                selector, subdirectory, attribute, progbar)

    with open('index.html', 'wt') as f:
        f.write(str(archived_content))

    os.chdir(current_path)


def main():
    args = get_parser().parse_args()

    path = os.path.join(ARCHIVE_ROOT, args.name)
    if not ALLOWED_NAMES_RE.match(args.name):
        print("Only characters 'a-zA-Z0-9', spaces, or '-.,:@' "
              "are allowed as names.")
        return

    if os.path.exists(path) and not args.force:
        print(("Archive “{}” already exists, set a new name with '-n <name>'"
               " or force an overwrite with '-f'.")
              .format(args.name))
        return

    os.makedirs(path, exist_ok=True)
    archive_to_dir(path, args.url)


if __name__ == '__main__':
    main()
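
# Example invocation (assuming this file is saved as miniarchiver.py):
#
#     ./miniarchiver.py https://example.com --name 'example home page'
#
# This writes archive/example home page/index.html alongside styles/,
# images/, scripts/, and icons/ subdirectories holding the downloaded assets.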