diff --git a/archive.py b/archive.py
index 1116e79..d3989a9 100644
--- a/archive.py
+++ b/archive.py
@@ -1,17 +1,25 @@
+import logging
 from waybackpy import WaybackMachineSaveAPI
 import time
 import random
 from config import ARCHIVE_URLS, USER_AGENT
 
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
 def archive_url(url):
-    print(f"🌐 Archiving {url} ...")
+    logger.info(f"🌐 Archiving {url} ...")
     try:
         save_api = WaybackMachineSaveAPI(url, user_agent=USER_AGENT)
         save_api.save()
-        print(f"✅ Archived {url}")
+        logger.info(f"✅ Archived {url}")
     except Exception as e:
-        print(f"⚠️ Exception archiving {url}: {e}")
+        logger.error(f"⚠️ Exception archiving {url}: {e}")
 
 def archive_all_urls():
     for url in ARCHIVE_URLS:
diff --git a/downloader.py b/downloader.py
index d6c69fd..94f1470 100644
--- a/downloader.py
+++ b/downloader.py
@@ -4,23 +4,33 @@ from config import ZIP_URL, ZIP_FILENAME, HTML_FILENAME, XLSX_URL, XLSX_FILENAME
 
 def download_zip_and_extract_html():
     print("🔄 Downloading ZIP...")
-    r = requests.get(ZIP_URL)
-    r.raise_for_status()
-    with open(ZIP_FILENAME, "wb") as f:
-        f.write(r.content)
-    print(f"✅ Saved ZIP as {ZIP_FILENAME}")
+    try:
+        with requests.get(ZIP_URL, timeout=30) as r:
+            r.raise_for_status()
+            with open(ZIP_FILENAME, "wb") as f:
+                f.write(r.content)
+        print(f"✅ Saved ZIP as {ZIP_FILENAME}")
+    except requests.RequestException as e:
+        print(f"❌ Failed to download ZIP: {e}")
+        return
 
-    with zipfile.ZipFile(ZIP_FILENAME, "r") as z:
-        with z.open(HTML_FILENAME) as html_file:
-            html_content = html_file.read()
-    with open(HTML_FILENAME, "wb") as f:
-        f.write(html_content)
-    print(f"✅ Extracted {HTML_FILENAME}")
+    try:
+        with zipfile.ZipFile(ZIP_FILENAME, "r") as z:
+            with z.open(HTML_FILENAME) as html_file:
+                html_content = html_file.read()
+        with open(HTML_FILENAME, "wb") as f:
+            f.write(html_content)
+        print(f"✅ Extracted {HTML_FILENAME}")
+    except (zipfile.BadZipFile, KeyError) as e:
+        print(f"❌ Failed to extract {HTML_FILENAME}: {e}")
 
 def download_xlsx():
     print("🔄 Downloading XLSX...")
-    r = requests.get(XLSX_URL)
-    r.raise_for_status()
-    with open(XLSX_FILENAME, "wb") as f:
-        f.write(r.content)
-    print(f"✅ Saved XLSX as {XLSX_FILENAME}")
+    try:
+        with requests.get(XLSX_URL, timeout=30) as r:
+            r.raise_for_status()
+            with open(XLSX_FILENAME, "wb") as f:
+                f.write(r.content)
+        print(f"✅ Saved XLSX as {XLSX_FILENAME}")
+    except requests.RequestException as e:
+        print(f"❌ Failed to download XLSX: {e}")