s
This commit is contained in:
parent
c6556a0534
commit
0c82b20e94
2 changed files with 37 additions and 19 deletions
14
archive.py
14
archive.py
|
|
@ -1,17 +1,25 @@
|
||||||
|
import logging
|
||||||
from waybackpy import WaybackMachineSaveAPI
|
from waybackpy import WaybackMachineSaveAPI
|
||||||
import time
|
import time
|
||||||
import random
|
import random
|
||||||
|
|
||||||
from config import ARCHIVE_URLS, USER_AGENT
|
from config import ARCHIVE_URLS, USER_AGENT
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
def archive_url(url):
|
def archive_url(url):
|
||||||
print(f"🌐 Archiving {url} ...")
|
logger.info(f"🌐 Archiving {url} ...")
|
||||||
try:
|
try:
|
||||||
save_api = WaybackMachineSaveAPI(url, user_agent=USER_AGENT)
|
save_api = WaybackMachineSaveAPI(url, user_agent=USER_AGENT)
|
||||||
save_api.save()
|
save_api.save()
|
||||||
print(f"✅ Archived {url}")
|
logger.info(f"✅ Archived {url}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠️ Exception archiving {url}: {e}")
|
logger.error(f"⚠️ Exception archiving {url}: {e}")
|
||||||
|
|
||||||
def archive_all_urls():
|
def archive_all_urls():
|
||||||
for url in ARCHIVE_URLS:
|
for url in ARCHIVE_URLS:
|
||||||
|
|
|
||||||
|
|
@ -4,23 +4,33 @@ from config import ZIP_URL, ZIP_FILENAME, HTML_FILENAME, XLSX_URL, XLSX_FILENAME
|
||||||
|
|
||||||
def download_zip_and_extract_html():
|
def download_zip_and_extract_html():
|
||||||
print("🔄 Downloading ZIP...")
|
print("🔄 Downloading ZIP...")
|
||||||
r = requests.get(ZIP_URL)
|
try:
|
||||||
|
with requests.get(ZIP_URL, timeout=30) as r:
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
with open(ZIP_FILENAME, "wb") as f:
|
with open(ZIP_FILENAME, "wb") as f:
|
||||||
f.write(r.content)
|
f.write(r.content)
|
||||||
print(f"✅ Saved ZIP as {ZIP_FILENAME}")
|
print(f"✅ Saved ZIP as {ZIP_FILENAME}")
|
||||||
|
except requests.RequestException as e:
|
||||||
|
print(f"❌ Failed to download ZIP: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
|
try:
|
||||||
with zipfile.ZipFile(ZIP_FILENAME, "r") as z:
|
with zipfile.ZipFile(ZIP_FILENAME, "r") as z:
|
||||||
with z.open(HTML_FILENAME) as html_file:
|
with z.open(HTML_FILENAME) as html_file:
|
||||||
html_content = html_file.read()
|
html_content = html_file.read()
|
||||||
with open(HTML_FILENAME, "wb") as f:
|
with open(HTML_FILENAME, "wb") as f:
|
||||||
f.write(html_content)
|
f.write(html_content)
|
||||||
print(f"✅ Extracted {HTML_FILENAME}")
|
print(f"✅ Extracted {HTML_FILENAME}")
|
||||||
|
except (zipfile.BadZipFile, KeyError) as e:
|
||||||
|
print(f"❌ Failed to extract {HTML_FILENAME}: {e}")
|
||||||
|
|
||||||
def download_xlsx():
|
def download_xlsx():
|
||||||
print("🔄 Downloading XLSX...")
|
print("🔄 Downloading XLSX...")
|
||||||
r = requests.get(XLSX_URL)
|
try:
|
||||||
|
with requests.get(XLSX_URL, timeout=30) as r:
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
with open(XLSX_FILENAME, "wb") as f:
|
with open(XLSX_FILENAME, "wb") as f:
|
||||||
f.write(r.content)
|
f.write(r.content)
|
||||||
print(f"✅ Saved XLSX as {XLSX_FILENAME}")
|
print(f"✅ Saved XLSX as {XLSX_FILENAME}")
|
||||||
|
except requests.RequestException as e:
|
||||||
|
print(f"❌ Failed to download XLSX: {e}")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue