formatting + waybackpy

This commit is contained in:
Eduard Prigoana 2025-07-22 07:03:24 +03:00
parent 9552369760
commit d33ced1214
6 changed files with 14 additions and 16 deletions

Binary file not shown.

Binary file not shown.

View file

@@ -1,16 +1,15 @@
import requests, time, random from waybackpy import WaybackMachineSaveAPI
import time
import random
from config import ARCHIVE_URLS, USER_AGENT from config import ARCHIVE_URLS, USER_AGENT
def archive_url(url): def archive_url(url):
print(f"🌐 Archiving {url} ...") print(f"🌐 Archiving {url} ...")
headers = {"User-Agent": USER_AGENT}
try: try:
resp = requests.get(f"https://web.archive.org/save/{url}", headers=headers, timeout=30) save_api = WaybackMachineSaveAPI(url, user_agent=USER_AGENT)
if resp.status_code == 200: save_api.save()
print(f"✅ Archived {url}") print(f"✅ Archived {url}")
else:
print(f"⚠️ Failed to archive {url}, status code {resp.status_code}")
except Exception as e: except Exception as e:
print(f"⚠️ Exception archiving {url}: {e}") print(f"⚠️ Exception archiving {url}: {e}")
@@ -19,3 +18,7 @@ def archive_all_urls():
delay = 10 + random.uniform(-3, 3) delay = 10 + random.uniform(-3, 3)
time.sleep(delay) time.sleep(delay)
archive_url(url) archive_url(url)
def test_archive():
test_url = "https://httpbin.org/anything/foo/bar"
archive_url(test_url)

View file

@@ -20,7 +20,7 @@ exclude_names = {
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36" USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"
BASE_URL = "http://localhost:5000" BASE_URL = "https://artistgrid.cx/"
ARCHIVE_URLS = [ ARCHIVE_URLS = [
f"{BASE_URL}/", f"{BASE_URL}/",

View file

@@ -4,6 +4,7 @@ import threading
from config import HTML_FILENAME, CSV_FILENAME, XLSX_FILENAME from config import HTML_FILENAME, CSV_FILENAME, XLSX_FILENAME
from update_loop import update_loop from update_loop import update_loop
from archive import test_archive
app = Flask(__name__) app = Flask(__name__)
CORS(app) CORS(app)
@@ -39,11 +40,4 @@ if __name__ == "__main__":
from downloader import download_zip_and_extract_html, download_xlsx from downloader import download_zip_and_extract_html, download_xlsx
from parser import generate_csv from parser import generate_csv
try:
download_zip_and_extract_html()
download_xlsx()
generate_csv()
except Exception as e:
print(f"⚠️ Initial update failed: {e}")
app.run(host="0.0.0.0", port=5000) app.run(host="0.0.0.0", port=5000)

View file

@@ -2,4 +2,5 @@ Flask
requests requests
beautifulsoup4 beautifulsoup4
lxml lxml
flask-cors flask-cors
waybackpy