Use a custom user-agent.

kenkeiras 2017-07-05 00:14:47 +02:00
parent 015b4da5e9
commit 36b375d9d4


@@ -8,6 +8,8 @@ import urllib.request
 import urllib.parse
 from bs4 import BeautifulSoup as bs4
 
+USER_AGENT = 'miniarchiver bot'
+
 
 def get_parser():
     parser = argparse.ArgumentParser()
@@ -22,6 +24,16 @@ def get_extension(path):
            .split('.')[-1])
 
 
+def request(url):
+    req = urllib.request.Request(
+        url,
+        data=None,
+        headers={
+            'User-Agent': USER_AGENT,
+        }
+    )
+    return urllib.request.urlopen(req)
+
 def relink_css(content, base_url, directory='styles'):
     os.makedirs(directory, exist_ok=True)
     for link in content.find_all('link', rel='stylesheet'):
@@ -30,7 +42,7 @@ def relink_css(content, base_url, directory='styles'):
         name = hashlib.sha1(href.encode()).hexdigest() + '.css'
         path = os.path.join(directory, name)
         if not os.path.exists(path):
-            content = urllib.request.urlopen(href).read()
+            content = request(href).read()
             with open(path, 'wb') as f:
                 f.write(content)
 
@@ -45,7 +57,7 @@ def relink_images(content, base_url, directory='images'):
         name = hashlib.sha1(src.encode()).hexdigest() + '.' + get_extension(src)
         path = os.path.join(directory, name)
         if not os.path.exists(path):
-            content = urllib.request.urlopen(src).read()
+            content = request(src).read()
             with open(path, 'wb') as f:
                 f.write(content)
 
@@ -60,7 +72,7 @@ def relink_scripts(content, base_url, directory='scripts'):
         name = hashlib.sha1(href.encode()).hexdigest() + '.' + get_extension(href)
         path = os.path.join(directory, name)
         if not os.path.exists(path):
-            content = urllib.request.urlopen(href).read()
+            content = request(href).read()
             with open(path, 'wb') as f:
                 f.write(content)
 
@@ -87,7 +99,7 @@ def archive_to_dir(directory, url):
     current_path = os.getcwd()
     os.chdir(directory)
     # Download file
-    base_file = urllib.request.urlopen(url).read()
+    base_file = request(url).read()
     archived_content = bs4(base_file, 'html5lib')
 
     # Relink CSS
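
For context, a minimal self-contained sketch of the pattern this commit introduces: wrapping urllib.request.urlopen so every request carries the custom USER_AGENT header instead of urllib's default ('Python-urllib/x.y'), which some servers reject. The __main__ block and its target URL are hypothetical, added here only for illustration:

import urllib.request

USER_AGENT = 'miniarchiver bot'

def request(url):
    # Build a Request object so headers can be attached to the GET.
    req = urllib.request.Request(
        url,
        data=None,  # no body, so this stays a plain GET
        headers={'User-Agent': USER_AGENT},
    )
    return urllib.request.urlopen(req)

if __name__ == '__main__':
    # Hypothetical target URL, for illustration only.
    with request('https://example.com/') as response:
        print(response.status, len(response.read()))

Centralizing the urlopen call in one helper means the archiver's CSS, image, script, and base-page downloads all pick up the header from a single place, which is why the rest of the diff is just call-site substitutions.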