This commit is contained in:
Eduard Prigoana 2025-07-22 10:22:51 +03:00
parent c6556a0534
commit 0c82b20e94
2 changed files with 37 additions and 19 deletions

View file

@@ -1,17 +1,25 @@
import logging
from waybackpy import WaybackMachineSaveAPI from waybackpy import WaybackMachineSaveAPI
import time import time
import random import random
from config import ARCHIVE_URLS, USER_AGENT from config import ARCHIVE_URLS, USER_AGENT
# Module-wide logging setup: timestamped "time - LEVEL - message" lines at INFO.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
def archive_url(url):
    """Submit *url* to the Wayback Machine save API.

    Failures are logged and swallowed so a single bad URL does not abort
    a batch run driven by archive_all_urls().

    Args:
        url: The URL to archive.
    """
    # Lazy %-style args: the message is only formatted if the level is enabled.
    logger.info("🌐 Archiving %s ...", url)
    try:
        save_api = WaybackMachineSaveAPI(url, user_agent=USER_AGENT)
        save_api.save()
        logger.info("✅ Archived %s", url)
    except Exception as e:
        # logger.exception records the full traceback in addition to the
        # message, which makes intermittent Wayback failures diagnosable.
        logger.exception("⚠️ Exception archiving %s: %s", url, e)
def archive_all_urls(): def archive_all_urls():
for url in ARCHIVE_URLS: for url in ARCHIVE_URLS:

View file

@@ -4,23 +4,33 @@ from config import ZIP_URL, ZIP_FILENAME, HTML_FILENAME, XLSX_URL, XLSX_FILENAME
def download_zip_and_extract_html():
    """Download the ZIP archive and extract HTML_FILENAME from it.

    Downloads ZIP_URL to ZIP_FILENAME, then extracts the member named
    HTML_FILENAME and writes it next to the ZIP. Progress and failures are
    reported on stdout. On download failure the function returns early
    without attempting extraction.
    """
    print("🔄 Downloading ZIP...")
    try:
        # stream=True + iter_content keeps memory flat for large archives
        # instead of buffering the entire body in r.content.
        with requests.get(ZIP_URL, timeout=30, stream=True) as r:
            r.raise_for_status()
            with open(ZIP_FILENAME, "wb") as f:
                for chunk in r.iter_content(chunk_size=65536):
                    f.write(chunk)
        print(f"✅ Saved ZIP as {ZIP_FILENAME}")
    except requests.RequestException as e:
        print(f"❌ Failed to download ZIP: {e}")
        return
    try:
        with zipfile.ZipFile(ZIP_FILENAME, "r") as z:
            with z.open(HTML_FILENAME) as html_file:
                html_content = html_file.read()
        with open(HTML_FILENAME, "wb") as f:
            f.write(html_content)
        print(f"✅ Extracted {HTML_FILENAME}")
    except (zipfile.BadZipFile, KeyError) as e:
        # BadZipFile: corrupt/truncated download; KeyError: member missing.
        print(f"❌ Failed to extract {HTML_FILENAME}: {e}")
def download_xlsx():
    """Download the XLSX export from XLSX_URL to XLSX_FILENAME.

    Progress and failures are reported on stdout; network errors are
    caught and printed rather than propagated.
    """
    print("🔄 Downloading XLSX...")
    try:
        # stream=True + iter_content avoids holding the whole file in memory.
        with requests.get(XLSX_URL, timeout=30, stream=True) as r:
            r.raise_for_status()
            with open(XLSX_FILENAME, "wb") as f:
                for chunk in r.iter_content(chunk_size=65536):
                    f.write(chunk)
        print(f"✅ Saved XLSX as {XLSX_FILENAME}")
    except requests.RequestException as e:
        print(f"❌ Failed to download XLSX: {e}")