Add initial concept.

This commit is contained in:
kenkeiras 2017-07-05 00:05:01 +02:00
commit 015b4da5e9
3 changed files with 138 additions and 0 deletions

18
.gitignore vendored Normal file
View File

@ -0,0 +1,18 @@
# Temporary editor files
*#*
*~
*.swp
# Python temporary files
__pycache__
*.pyc
# Cache files
cache/
# Build files
dist/
*.egg-info/
# Directories for testing
test_ma

118
macli.py Normal file
View File

@ -0,0 +1,118 @@
#!/usr/bin/env python3
import hashlib
import os
import argparse
import urllib
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup as bs4
def get_parser():
    """Build the command-line parser: a single positional ``url``."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('url')
    return arg_parser
def get_extension(path):
    """Return the file extension of *path* (text after the last dot).

    Handles both '/' and '\\' as path separators; if there is no dot,
    the whole final path component is returned unchanged.
    """
    tail = path.split('/')[-1]
    tail = tail.split('\\')[-1]
    return tail.split('.')[-1]
def relink_css(content, base_url, directory='styles'):
    """Download every stylesheet referenced by *content* and rewrite the
    <link rel="stylesheet"> tags to point at the local copies.

    content: parsed document tree (BeautifulSoup) — modified in place.
    base_url: URL the document came from; used to resolve relative hrefs.
    directory: target folder for downloads (created if missing), relative
        to the current working directory.
    """
    os.makedirs(directory, exist_ok=True)
    for link in content.find_all('link', rel='stylesheet'):
        href = urllib.parse.urljoin(base_url, link['href'], allow_fragments=False)
        # Content-addressed filename: same URL always maps to the same
        # local file, so repeated runs reuse earlier downloads.
        name = hashlib.sha1(href.encode()).hexdigest() + '.css'
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            # FIX: previously this bound the download to `content`,
            # shadowing the document parameter; use a distinct name.
            data = urllib.request.urlopen(href).read()
            with open(path, 'wb') as f:
                f.write(data)
        link['href'] = path
def relink_images(content, base_url, directory='images'):
    """Download every image referenced by *content* and rewrite the
    <img> tags to point at the local copies.

    content: parsed document tree (BeautifulSoup) — modified in place.
    base_url: URL the document came from; used to resolve relative srcs.
    directory: target folder for downloads (created if missing), relative
        to the current working directory.
    """
    os.makedirs(directory, exist_ok=True)
    for image in content.find_all('img'):
        src = urllib.parse.urljoin(base_url, image['src'], allow_fragments=False)
        # Content-addressed filename, keeping the original extension so
        # browsers/servers can still infer the image type.
        name = hashlib.sha1(src.encode()).hexdigest() + '.' + get_extension(src)
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            # FIX: previously this bound the download to `content`,
            # shadowing the document parameter; use a distinct name.
            data = urllib.request.urlopen(src).read()
            with open(path, 'wb') as f:
                f.write(data)
        image['src'] = path
def relink_scripts(content, base_url, directory='scripts'):
    """Download every external script referenced by *content* and rewrite
    the <script> tags to point at the local copies.

    content: parsed document tree (BeautifulSoup) — modified in place.
    base_url: URL the document came from; used to resolve relative srcs.
    directory: target folder for downloads (created if missing), relative
        to the current working directory.
    """
    os.makedirs(directory, exist_ok=True)
    for script in content.find_all('script'):
        # FIX: inline <script> blocks have no src attribute — indexing
        # script['src'] raised KeyError on any page with inline JS.
        if 'src' not in script.attrs:
            continue
        href = urllib.parse.urljoin(base_url, script['src'], allow_fragments=False)
        # Content-addressed filename, keeping the original extension.
        name = hashlib.sha1(href.encode()).hexdigest() + '.' + get_extension(href)
        path = os.path.join(directory, name)
        if not os.path.exists(path):
            # FIX: previously this bound the download to `content`,
            # shadowing the document parameter; use a distinct name.
            data = urllib.request.urlopen(href).read()
            with open(path, 'wb') as f:
                f.write(data)
        script['src'] = path
def relink_links(content, base_url):
    """Rewrite relative hrefs in *content* to absolute URLs.

    Covers <a> anchors and non-stylesheet <link> tags (stylesheets are
    handled separately, since those get downloaded locally).  The tree is
    modified in place; tags without an href are left alone.
    """
    for anchor in content.find_all('a'):
        if 'href' in anchor.attrs:
            anchor['href'] = urllib.parse.urljoin(base_url, anchor['href'])
    for tag in content.find_all('link'):
        if 'href' in tag.attrs and 'stylesheet' not in tag.attrs.get('rel', ''):
            tag['href'] = urllib.parse.urljoin(base_url, tag['href'])
def archive_to_dir(directory, url):
    """Fetch *url*, localize its assets, and save it as an archive.

    Downloads the page into *directory* (which must already exist),
    pulls its stylesheets, scripts and images into subfolders, rewrites
    remaining links to absolute URLs, and writes the result as
    ``index.html`` inside *directory*.
    """
    current_path = os.getcwd()
    os.chdir(directory)
    # FIX: restore the caller's working directory even when a download
    # or parse step raises — previously an exception left the process
    # chdir'd into the archive directory.
    try:
        # Download and parse the page (html5lib for lenient parsing).
        base_file = urllib.request.urlopen(url).read()
        archived_content = bs4(base_file, 'html5lib')
        # Localize assets, then absolutize whatever links remain.
        relink_css(archived_content, url)
        relink_scripts(archived_content, url)
        relink_images(archived_content, url)
        relink_links(archived_content, url)
        with open('index.html', 'wt') as f:
            f.write(str(archived_content))
    finally:
        os.chdir(current_path)
def main():
    """CLI entry point: archive the URL given on the command line."""
    options = get_parser().parse_args()
    target = 'test_ma'
    os.makedirs(target, exist_ok=True)
    archive_to_dir(target, options.url)


if __name__ == '__main__':
    main()

2
requirements.txt Normal file
View File

@ -0,0 +1,2 @@
bs4
html5lib