Use a custom user-agent.

parent 015b4da5e9
commit 36b375d9d4

macli.py | 20 ++++++++++++++++----
1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/macli.py b/macli.py
--- a/macli.py
+++ b/macli.py
@@ -8,6 +8,8 @@ import urllib.request
 import urllib.parse
 from bs4 import BeautifulSoup as bs4
 
+USER_AGENT = 'miniarchiver bot'
+
 
 def get_parser():
     parser = argparse.ArgumentParser()
@@ -22,6 +24,16 @@ def get_extension(path):
             .split('.')[-1])
 
 
+def request(url):
+    req = urllib.request.Request(
+        url,
+        data=None,
+        headers={
+            'User-Agent': USER_AGENT,
+        }
+    )
+    return urllib.request.urlopen(req)
+
 def relink_css(content, base_url, directory='styles'):
     os.makedirs(directory, exist_ok=True)
     for link in content.find_all('link', rel='stylesheet'):
@@ -30,7 +42,7 @@ def relink_css(content, base_url, directory='styles'):
         name = hashlib.sha1(href.encode()).hexdigest() + '.css'
         path = os.path.join(directory, name)
         if not os.path.exists(path):
-            content = urllib.request.urlopen(href).read()
+            content = request(href).read()
             with open(path, 'wb') as f:
                 f.write(content)
 
@@ -45,7 +57,7 @@ def relink_images(content, base_url, directory='images'):
         name = hashlib.sha1(src.encode()).hexdigest() + '.' + get_extension(src)
         path = os.path.join(directory, name)
         if not os.path.exists(path):
-            content = urllib.request.urlopen(src).read()
+            content = request(src).read()
             with open(path, 'wb') as f:
                 f.write(content)
 
@@ -60,7 +72,7 @@ def relink_scripts(content, base_url, directory='scripts'):
         name = hashlib.sha1(href.encode()).hexdigest() + '.' + get_extension(href)
         path = os.path.join(directory, name)
         if not os.path.exists(path):
-            content = urllib.request.urlopen(href).read()
+            content = request(href).read()
             with open(path, 'wb') as f:
                 f.write(content)
 
@@ -87,7 +99,7 @@ def archive_to_dir(directory, url):
     current_path = os.getcwd()
     os.chdir(directory)
     # Download file
-    base_file = urllib.request.urlopen(url).read()
+    base_file = request(url).read()
     archived_content = bs4(base_file, 'html5lib')
 
     # Relink CSS
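For context, a minimal standalone sketch of the pattern this commit introduces: every download goes through one helper that sends the custom User-Agent header instead of urllib's default. The example URL and the __main__ block below are illustrative only, not part of macli.py.

import urllib.request

USER_AGENT = 'miniarchiver bot'


def request(url):
    # Attach the custom User-Agent so servers see 'miniarchiver bot'
    # rather than the default 'Python-urllib/3.x'.
    req = urllib.request.Request(
        url,
        data=None,
        headers={'User-Agent': USER_AGENT},
    )
    return urllib.request.urlopen(req)


if __name__ == '__main__':
    # Hypothetical usage, mirroring how archive_to_dir() now fetches pages.
    body = request('https://example.com/').read()
    print(len(body), 'bytes downloaded')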