miniarchiver/macli.py

#!/usr/bin/env python3
import argparse
import hashlib
import os
import re
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup as bs4

import progress_meter  # local helper module that provides ProgressBar

USER_AGENT = 'miniarchiver bot'
ARCHIVE_ROOT = 'archive'
DEFAULT_NAME = 'archived_web'

# Archive names become directory names, so restrict them to a safe set.
ALLOWED_NAMES_RE = re.compile(r'^[- .,:@a-zA-Z0-9]+$')

# Each descriptor: (BeautifulSoup find_all() selector,
#                   target subdirectory, attribute holding the URL).
OBJECT_TYPE_DESCRIPTORS = (
    (
        {'name': 'link', 'rel': 'stylesheet'},
        'styles',
        'href',
    ),
    (
        {'name': 'img'},
        'images',
        'src',
    ),
    (
        {'name': 'script'},
        'scripts',
        'src',
    ),
    (
        {'name': 'link', 'rel': 'icon'},
        'icons',
        'href',
    ),
)
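# For instance, the first descriptor makes archive() call
# content.find_all(name='link', rel='stylesheet') and rewrite each match's
# 'href' to point at the downloaded copy under the 'styles/' subdirectory.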

def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('url')
    parser.add_argument('--name', '-n', default=DEFAULT_NAME)
    parser.add_argument('--force', '-f', action='store_true')
    return parser

def get_filename(path):
    # Strip any query string, then take the last path component,
    # handling both '/' and '\' separators.
    return (path
            .split('?')[0]
            .split('/')[-1]
            .split('\\')[-1])


def get_extension(path):
    return get_filename(path).split('.')[-1]

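# Illustrative behavior of the helpers above (the URL is a made-up example):
#   get_filename('https://example.com/a/style.css?v=2')  -> 'style.css'
#   get_extension('https://example.com/a/style.css?v=2') -> 'css'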

def request(url):
    req = urllib.request.Request(
        url,
        data=None,
        headers={'User-Agent': USER_AGENT},
    )
    return urllib.request.urlopen(req)

def show_error(e, href=None):
    # Clear the current progress line, then print the error white-on-red.
    print("\r\x1b[K\x1b[41;37m{}\x1b[0m".format(e))
    if href is not None:
        print("URL: {}".format(href))

def archive(content, base_url, selector, directory, attribute, progbar):
    for part in content.find_all(**selector):
        if attribute not in part.attrs:
            continue
        # Inline data: URIs need no downloading.
        if part[attribute].startswith('data:'):
            continue
        href = urllib.parse.urljoin(base_url, part[attribute],
                                    allow_fragments=False)
        progbar.next_iter(href)

        # Prefix the filename with the SHA-1 of the absolute URL so that
        # equally named files from different locations cannot collide.
        name = (hashlib.sha1(href.encode()).hexdigest()
                + '/'
                + get_filename(part[attribute]))
        path = os.path.join(directory, name)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        if not os.path.exists(path):
            try:
                data = request(href).read()
            except Exception as e:
                show_error(e, href)
                continue
            with open(path, 'wb') as f:
                f.write(data)
        # Point the element at the local copy.
        part[attribute] = path

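# Illustrative on-disk layout after a run (hashes shortened, names made up):
#   styles/6ff87c4…/main.css
#   images/0a4d55a…/logo.png
# where each hex directory is the SHA-1 of the object's absolute URL.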
def relink_links(content, base_url):
    # Make anchor targets absolute so they keep working from the local copy.
    for link in content.find_all('a'):
        if 'href' not in link.attrs:
            continue
        link['href'] = urllib.parse.urljoin(base_url, link['href'])
    # Do the same for <link> elements, except stylesheets, which are
    # downloaded and relinked by archive() instead.
    for link in content.find_all('link'):
        if 'href' not in link.attrs or 'stylesheet' in link.attrs.get('rel', ''):
            continue
        link['href'] = urllib.parse.urljoin(base_url, link['href'])

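# urljoin resolves relative references against the page URL, e.g.
# (illustrative): urllib.parse.urljoin('https://example.com/a/', '../img/x.png')
# returns 'https://example.com/img/x.png'.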
def get_num_elements(content):
    # Total number of objects the archiver will visit; used to size
    # the progress bar.
    count = 0
    for (selector, _, attribute) in OBJECT_TYPE_DESCRIPTORS:
        count += sum(1
                     for element in content.find_all(**selector)
                     if attribute in element.attrs)
    return count

def archive_to_dir(directory, url):
    current_path = os.getcwd()
    os.chdir(directory)
    # Download the page itself
    base_file = request(url).read()
    archived_content = bs4(base_file, 'html5lib')
    # Make remaining links absolute
    relink_links(archived_content, url)
    # Archive referenced objects (styles, images, scripts, icons)
    progbar = progress_meter.ProgressBar(get_num_elements(archived_content))
    for (selector, subdir, attribute) in OBJECT_TYPE_DESCRIPTORS:
        archive(archived_content, url,
                selector, subdir, attribute, progbar)

    with open('index.html', 'wt') as f:
        f.write(str(archived_content))
    os.chdir(current_path)

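# Design note: archive() builds paths relative to the archive directory,
# which is why archive_to_dir() chdir()s into it first and restores the
# previous working directory when done.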
def main():
    args = get_parser().parse_args()
    path = os.path.join(ARCHIVE_ROOT, args.name)
    if not ALLOWED_NAMES_RE.match(args.name):
        print("Only the characters 'a-zA-Z0-9', spaces, and '-.,:@'"
              " are allowed in names.")
        return
    if os.path.exists(path) and not args.force:
        print(("Archive “{}” already exists; pick a new name with '-n <name>'"
               " or force an overwrite with '-f'.")
              .format(args.name))
        return
    os.makedirs(path, exist_ok=True)
    archive_to_dir(path, args.url)


if __name__ == '__main__':
    main()
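# Example invocation (illustrative; the archive name is arbitrary):
#   ./macli.py https://example.com --name example-snapshot
# writes archive/example-snapshot/index.html and stores referenced styles,
# images, scripts and icons under hashed subdirectories next to it.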