formatting + waybackpy
This commit is contained in:
parent
9552369760
commit
d33ced1214
6 changed files with 14 additions and 16 deletions
Binary file not shown.
Binary file not shown.
15
archive.py
15
archive.py
|
|
@@ -1,16 +1,15 @@
|
||||||
import requests, time, random
|
from waybackpy import WaybackMachineSaveAPI
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
|
||||||
from config import ARCHIVE_URLS, USER_AGENT
|
from config import ARCHIVE_URLS, USER_AGENT
|
||||||
|
|
||||||
def archive_url(url):
|
def archive_url(url):
|
||||||
print(f"🌐 Archiving {url} ...")
|
print(f"🌐 Archiving {url} ...")
|
||||||
headers = {"User-Agent": USER_AGENT}
|
|
||||||
try:
|
try:
|
||||||
resp = requests.get(f"https://web.archive.org/save/{url}", headers=headers, timeout=30)
|
save_api = WaybackMachineSaveAPI(url, user_agent=USER_AGENT)
|
||||||
if resp.status_code == 200:
|
save_api.save()
|
||||||
print(f"✅ Archived {url}")
|
print(f"✅ Archived {url}")
|
||||||
else:
|
|
||||||
print(f"⚠️ Failed to archive {url}, status code {resp.status_code}")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠️ Exception archiving {url}: {e}")
|
print(f"⚠️ Exception archiving {url}: {e}")
|
||||||
|
|
||||||
|
|
@@ -19,3 +18,7 @@ def archive_all_urls():
|
||||||
delay = 10 + random.uniform(-3, 3)
|
delay = 10 + random.uniform(-3, 3)
|
||||||
time.sleep(delay)
|
time.sleep(delay)
|
||||||
archive_url(url)
|
archive_url(url)
|
||||||
|
|
||||||
|
def test_archive():
|
||||||
|
test_url = "https://httpbin.org/anything/foo/bar"
|
||||||
|
archive_url(test_url)
|
||||||
|
|
|
||||||
|
|
@@ -20,7 +20,7 @@ exclude_names = {
|
||||||
|
|
||||||
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"
|
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"
|
||||||
|
|
||||||
BASE_URL = "http://localhost:5000"
|
BASE_URL = "https://artistgrid.cx/"
|
||||||
|
|
||||||
ARCHIVE_URLS = [
|
ARCHIVE_URLS = [
|
||||||
f"{BASE_URL}/",
|
f"{BASE_URL}/",
|
||||||
|
|
|
||||||
8
main.py
8
main.py
|
|
@@ -4,6 +4,7 @@ import threading
|
||||||
|
|
||||||
from config import HTML_FILENAME, CSV_FILENAME, XLSX_FILENAME
|
from config import HTML_FILENAME, CSV_FILENAME, XLSX_FILENAME
|
||||||
from update_loop import update_loop
|
from update_loop import update_loop
|
||||||
|
from archive import test_archive
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
CORS(app)
|
CORS(app)
|
||||||
|
|
@@ -39,11 +40,4 @@ if __name__ == "__main__":
|
||||||
from downloader import download_zip_and_extract_html, download_xlsx
|
from downloader import download_zip_and_extract_html, download_xlsx
|
||||||
from parser import generate_csv
|
from parser import generate_csv
|
||||||
|
|
||||||
try:
|
|
||||||
download_zip_and_extract_html()
|
|
||||||
download_xlsx()
|
|
||||||
generate_csv()
|
|
||||||
except Exception as e:
|
|
||||||
print(f"⚠️ Initial update failed: {e}")
|
|
||||||
|
|
||||||
app.run(host="0.0.0.0", port=5000)
|
app.run(host="0.0.0.0", port=5000)
|
||||||
|
|
|
||||||
|
|
@@ -3,3 +3,4 @@ requests
|
||||||
beautifulsoup4
|
beautifulsoup4
|
||||||
lxml
|
lxml
|
||||||
flask-cors
|
flask-cors
|
||||||
|
waybackpy
|
||||||
Loading…
Add table
Add a link
Reference in a new issue