Use folders to store archives of different sites.

This commit is contained in:
kenkeiras 2017-07-05 00:55:33 +02:00
parent a2bc995885
commit 42869cf410
2 changed files with 22 additions and 3 deletions

2
.gitignore vendored
View File

@ -15,4 +15,4 @@ dist/
*.egg-info/ *.egg-info/
# Directories for testing # Directories for testing
test_ma archive

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import re
import hashlib import hashlib
import os import os
import argparse import argparse
@ -9,6 +10,9 @@ import urllib.parse
from bs4 import BeautifulSoup as bs4 from bs4 import BeautifulSoup as bs4
USER_AGENT = 'miniarchiver bot' USER_AGENT = 'miniarchiver bot'
ARCHIVE_ROOT = 'archive'
DEFAULT_NAME = 'archived_web'
ALLOWED_NAMES_RE = re.compile(r'^[- .,a-zA-Z0-9]+$')
OBJECT_TYPE_DESCRIPTORS = ( OBJECT_TYPE_DESCRIPTORS = (
@ -48,6 +52,8 @@ OBJECT_TYPE_DESCRIPTORS = (
def get_parser():
    """Build the command-line argument parser for the archiver.

    Arguments defined:
        url          positional; the page to archive.
        --name / -n  name of the archive; defaults to DEFAULT_NAME.
        --force / -f boolean flag, False unless given.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    parser = argparse.ArgumentParser(
        description='Archive a web page into a named folder.')
    parser.add_argument('url', help='URL of the page to archive')
    parser.add_argument('--name', '-n', default=DEFAULT_NAME,
                        help='name of the archive (default: %(default)s)')
    parser.add_argument('--force', '-f', action='store_true',
                        help='proceed even if an archive with that name '
                             'already exists')
    return parser
@ -140,8 +146,21 @@ def archive_to_dir(directory, url):
def main():
    """Archive the URL given on the command line into its own folder.

    The archive is stored in ``ARCHIVE_ROOT/<name>``, where ``<name>`` comes
    from the ``--name`` option. The name is validated before any path is
    built or any directory is created. An already existing archive is only
    reused when ``--force`` is given.

    Invalid names and name collisions are reported through
    ``parser.error``, which prints the message to stderr and exits with a
    non-zero status, so calling scripts can detect the failure.
    """
    parser = get_parser()
    args = parser.parse_args()

    # fullmatch: `$` in the pattern would otherwise accept a trailing newline.
    if not ALLOWED_NAMES_RE.fullmatch(args.name):
        parser.error("Only characters 'a-zA-Z0-9', spaces, dots, commas and"
                     " dashes are allowed as names.")

    # The allowed character set includes dots, so '.' and '..' pass the
    # regex; they would resolve to the archive root itself or its parent.
    if args.name in (os.curdir, os.pardir):
        parser.error("“{}” is not a valid archive name.".format(args.name))

    path = os.path.join(ARCHIVE_ROOT, args.name)
    if os.path.exists(path) and not args.force:
        parser.error(("Archive “{}” already exists, set a new name with"
                      " '-n <name>' or force an overwrite with '-f'")
                     .format(args.name))

    os.makedirs(path, exist_ok=True)
    archive_to_dir(path, args.url)
if __name__ == '__main__': if __name__ == '__main__':