# Identifies the archiver to remote servers; sent on every download.
USER_AGENT = 'miniarchiver bot'


def request(url, timeout=30):
    """Open *url* with the archiver's User-Agent header and return the response.

    All downloads (the base page, stylesheets, images and scripts) should go
    through this helper so every outgoing request carries the same
    ``User-Agent`` instead of urllib's default one.

    Args:
        url: Absolute URL of the resource to fetch.
        timeout: Seconds to wait on blocking socket operations before giving
            up. Without a timeout a single unresponsive host would stall the
            whole archive run indefinitely. Pass ``None`` to wait forever.

    Returns:
        The response object produced by ``urllib.request.urlopen``. The
        caller is responsible for reading and closing it.

    Raises:
        urllib.error.URLError: If the resource cannot be retrieved
            (``HTTPError`` for HTTP-level failures).
    """
    # NOTE(review): urlopen also accepts non-HTTP schemes such as ``file:``.
    # URLs taken from archived pages are untrusted, so consider restricting
    # the allowed schemes at the call sites.
    req = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    return urllib.request.urlopen(req, timeout=timeout)