s
This commit is contained in:
parent
c6556a0534
commit
0c82b20e94
2 changed files with 37 additions and 19 deletions
14
archive.py
14
archive.py
|
|
@ -1,17 +1,25 @@
|
|||
import logging
|
||||
from waybackpy import WaybackMachineSaveAPI
|
||||
import time
|
||||
import random
|
||||
|
||||
from config import ARCHIVE_URLS, USER_AGENT
|
||||
|
||||
# Logging setup: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
||||
|
||||
def archive_url(url):
    """Ask the Wayback Machine (via waybackpy) to save *url* and log the outcome.

    This is best-effort: any exception raised while saving is logged at
    ERROR level and swallowed, so the function always returns ``None``.

    Args:
        url: The URL to submit for archiving.
    """
    logger.info("🌐 Archiving %s ...", url)
    try:
        save_api = WaybackMachineSaveAPI(url, user_agent=USER_AGENT)
        save_api.save()
    except Exception as e:
        # Deliberately broad: one failing URL must not abort the caller.
        logger.error("⚠️ Exception archiving %s: %s", url, e)
    else:
        logger.info("✅ Archived %s", url)
|
||||
|
||||
def archive_all_urls():
|
||||
for url in ARCHIVE_URLS:
|
||||
|
|
|
|||
|
|
@ -4,23 +4,33 @@ from config import ZIP_URL, ZIP_FILENAME, HTML_FILENAME, XLSX_URL, XLSX_FILENAME
|
|||
|
||||
def download_zip_and_extract_html():
    """Download the ZIP at ``ZIP_URL`` and extract ``HTML_FILENAME`` from it.

    The archive is saved to ``ZIP_FILENAME``; the member named
    ``HTML_FILENAME`` is then written out to a file of the same name.
    Progress and failures are reported with ``print``. A failed download
    returns early without attempting extraction; a bad archive or a missing
    member is reported and the function returns normally.
    """
    print("🔄 Downloading ZIP...")
    try:
        # Single guarded request with a timeout so a stalled server cannot
        # hang the run; HTTP error statuses are raised and handled below.
        with requests.get(ZIP_URL, timeout=30) as r:
            r.raise_for_status()
            with open(ZIP_FILENAME, "wb") as f:
                f.write(r.content)
        print(f"✅ Saved ZIP as {ZIP_FILENAME}")
    except requests.RequestException as e:
        print(f"❌ Failed to download ZIP: {e}")
        return

    try:
        with zipfile.ZipFile(ZIP_FILENAME, "r") as z:
            # z.open raises KeyError when the member is absent from the archive.
            with z.open(HTML_FILENAME) as html_file:
                html_content = html_file.read()
        with open(HTML_FILENAME, "wb") as f:
            f.write(html_content)
        print(f"✅ Extracted {HTML_FILENAME}")
    except (zipfile.BadZipFile, KeyError) as e:
        print(f"❌ Failed to extract {HTML_FILENAME}: {e}")
|
||||
|
||||
def download_xlsx():
    """Download the spreadsheet at ``XLSX_URL`` and save it as ``XLSX_FILENAME``.

    Progress and failures are reported with ``print``. Request errors
    (connection problems, timeouts, HTTP error statuses) are caught and
    reported rather than propagated.
    """
    print("🔄 Downloading XLSX...")
    try:
        # Single guarded request with a timeout so a stalled server cannot
        # hang the run; HTTP error statuses are raised and handled below.
        with requests.get(XLSX_URL, timeout=30) as r:
            r.raise_for_status()
            with open(XLSX_FILENAME, "wb") as f:
                f.write(r.content)
        print(f"✅ Saved XLSX as {XLSX_FILENAME}")
    except requests.RequestException as e:
        print(f"❌ Failed to download XLSX: {e}")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue