rewrite
This commit is contained in:
parent
fb499a72c4
commit
4b708ba090
13 changed files with 272 additions and 141 deletions
394
main.py
394
main.py
|
|
@ -1,168 +1,288 @@
|
|||
import os
|
||||
import threading
|
||||
import time
|
||||
import requests
|
||||
import zipfile
|
||||
import threading
|
||||
import time
|
||||
import random
|
||||
import hashlib
|
||||
from bs4 import BeautifulSoup
|
||||
import csv
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from flask import Flask, send_file, render_template, send_from_directory
|
||||
from flask_cors import CORS
|
||||
from flask import Flask, send_file
|
||||
import os
|
||||
import json
|
||||
|
||||
app = Flask(__name__, template_folder="templates")
|
||||
CORS(app) # ✅ ENABLE CORS FOR ALL ROUTES
|
||||
app = Flask(__name__)
|
||||
|
||||
# Constants
|
||||
ZIP_URL = "https://docs.google.com/spreadsheets/d/1S6WwM05O277npQbaiNk-jZlXK3TdooSyWtqaWUvAI78/export?format=zip"
|
||||
XLSX_URL = "https://docs.google.com/spreadsheets/d/1S6WwM05O277npQbaiNk-jZlXK3TdooSyWtqaWUvAI78/export?format=xlsx"
|
||||
ZIP_FILE = "Trackerhub Reup.zip"
|
||||
XLSX_FILE = "artists.xlsx"
|
||||
EXTRACT_FOLDER = "sheet"
|
||||
HTML_FILE = os.path.join(EXTRACT_FOLDER, "Artists.html")
|
||||
CSV_FILE = "artists.csv"
|
||||
|
||||
# Artist names to exclude (no emojis, trimmed)
|
||||
EXCLUDE_ARTISTS = {
|
||||
"🤖 AI Models",
|
||||
"🤖 Lawson",
|
||||
"Comps & Edits",
|
||||
ZIP_FILENAME = "Trackerhub.zip"
|
||||
HTML_FILENAME = "Artists.html"
|
||||
CSV_FILENAME = "artists.csv"
|
||||
XLSX_FILENAME = "artists.xlsx"
|
||||
|
||||
exclude_names = {
|
||||
"AI Models",
|
||||
"Lawson",
|
||||
"BPM Tracker",
|
||||
"Worst Comps & Edits",
|
||||
"Yedits",
|
||||
"Allegations",
|
||||
"Rap Disses Timeline",
|
||||
"Underground Artists"
|
||||
"Underground Artists",
|
||||
}
|
||||
|
||||
def remove_emojis(text):
|
||||
emoji_pattern = re.compile(
|
||||
r'\s*['
|
||||
'\U0001F600-\U0001F64F'
|
||||
'\U0001F300-\U0001F5FF'
|
||||
'\U0001F680-\U0001F6FF'
|
||||
'\U0001F1E0-\U0001F1FF'
|
||||
'\u2702-\u27B0'
|
||||
'\u24C2-\U0001F251'
|
||||
']\s*',
|
||||
flags=re.UNICODE
|
||||
)
|
||||
cleaned_text = emoji_pattern.sub('', text)
|
||||
return cleaned_text.strip()
|
||||
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"
|
||||
|
||||
def fetch_and_process():
|
||||
# URLs to archive on changes — update these to your actual hosted domain
|
||||
BASE_URL = "http://localhost:5000" # Change this to your public domain when deployed
|
||||
|
||||
ARCHIVE_URLS = [
|
||||
f"{BASE_URL}/",
|
||||
f"{BASE_URL}/index.html/",
|
||||
f"{BASE_URL}/artists.html",
|
||||
f"{BASE_URL}/artists.csv",
|
||||
f"{BASE_URL}/artists.xlsx",
|
||||
]
|
||||
|
||||
DISCORD_WEBHOOK_URL = os.getenv("DISCORD_WEBHOOK_URL")
|
||||
|
||||
def clean_artist_name(text):
|
||||
return re.sub(r'[⭐🤖🎭\u2B50\uFE0F]', '', text).strip()
|
||||
|
||||
def force_star_flag(starred=True):
|
||||
return "Yes" if starred else "No"
|
||||
|
||||
def download_zip_and_extract_html():
|
||||
print("🔄 Downloading ZIP...")
|
||||
r = requests.get(ZIP_URL)
|
||||
r.raise_for_status()
|
||||
with open(ZIP_FILENAME, "wb") as f:
|
||||
f.write(r.content)
|
||||
print(f"✅ Saved ZIP as {ZIP_FILENAME}")
|
||||
|
||||
with zipfile.ZipFile(ZIP_FILENAME, "r") as z:
|
||||
with z.open(HTML_FILENAME) as html_file:
|
||||
html_content = html_file.read()
|
||||
with open(HTML_FILENAME, "wb") as f:
|
||||
f.write(html_content)
|
||||
print(f"✅ Extracted {HTML_FILENAME}")
|
||||
|
||||
def download_xlsx():
|
||||
print("🔄 Downloading XLSX...")
|
||||
r = requests.get(XLSX_URL)
|
||||
r.raise_for_status()
|
||||
with open(XLSX_FILENAME, "wb") as f:
|
||||
f.write(r.content)
|
||||
print(f"✅ Saved XLSX as {XLSX_FILENAME}")
|
||||
|
||||
def generate_csv():
|
||||
print("📝 Generating CSV...")
|
||||
with open(HTML_FILENAME, "r", encoding="utf-8") as f:
|
||||
soup = BeautifulSoup(f, "html.parser")
|
||||
|
||||
rows = soup.select("table.waffle tbody tr")[3:] # skip headers and Discord
|
||||
|
||||
data = []
|
||||
starring = True
|
||||
|
||||
for row in rows:
|
||||
cells = row.find_all("td")
|
||||
if len(cells) < 4:
|
||||
continue
|
||||
|
||||
link_tag = cells[0].find("a")
|
||||
artist_name_raw = link_tag.get_text(strip=True) if link_tag else cells[0].get_text(strip=True)
|
||||
artist_url = link_tag["href"] if link_tag else ""
|
||||
if not artist_url:
|
||||
continue
|
||||
|
||||
if "AI Models" in artist_name_raw:
|
||||
starring = False
|
||||
|
||||
artist_name_clean = clean_artist_name(artist_name_raw)
|
||||
if artist_name_clean in exclude_names:
|
||||
continue
|
||||
|
||||
if "🚩" in artist_name_raw:
|
||||
continue
|
||||
|
||||
best = force_star_flag(starring)
|
||||
credit = cells[1].get_text(strip=True)
|
||||
updated = cells[2].get_text(strip=True)
|
||||
links_work = cells[3].get_text(strip=True)
|
||||
|
||||
data.append([artist_name_clean, artist_url, credit, links_work, updated, best])
|
||||
|
||||
with open(CSV_FILENAME, "w", newline='', encoding="utf-8") as csvfile:
|
||||
writer = csv.writer(csvfile)
|
||||
writer.writerow(["Artist Name", "URL", "Credit", "Links Work", "Updated", "Best"])
|
||||
writer.writerows(data)
|
||||
|
||||
print(f"✅ CSV saved as {CSV_FILENAME}")
|
||||
|
||||
def hash_file(filename):
|
||||
hasher = hashlib.sha256()
|
||||
with open(filename, "rb") as f:
|
||||
buf = f.read()
|
||||
hasher.update(buf)
|
||||
return hasher.hexdigest()
|
||||
|
||||
def archive_url(url):
|
||||
print(f"🌐 Archiving {url} ...")
|
||||
headers = {"User-Agent": USER_AGENT}
|
||||
try:
|
||||
print("[*] Downloading ZIP...")
|
||||
r = requests.get(ZIP_URL)
|
||||
with open(ZIP_FILE, "wb") as f:
|
||||
f.write(r.content)
|
||||
|
||||
print("[*] Extracting ZIP...")
|
||||
with zipfile.ZipFile(ZIP_FILE, 'r') as zip_ref:
|
||||
zip_ref.extractall(EXTRACT_FOLDER)
|
||||
|
||||
print("[*] Downloading XLSX...")
|
||||
r = requests.get(XLSX_URL)
|
||||
with open(XLSX_FILE, "wb") as f:
|
||||
f.write(r.content)
|
||||
|
||||
print("[*] Parsing HTML...")
|
||||
with open(HTML_FILE, "r", encoding="utf-8") as f:
|
||||
soup = BeautifulSoup(f, "html.parser")
|
||||
|
||||
table = soup.find("table", class_="waffle")
|
||||
if not table:
|
||||
print("[!] Table not found.")
|
||||
return
|
||||
|
||||
rows = table.find_all("tr")[4:]
|
||||
|
||||
data = []
|
||||
for row in rows:
|
||||
cols = row.find_all("td")
|
||||
if len(cols) < 4:
|
||||
continue
|
||||
|
||||
artist_cell = cols[0]
|
||||
a_tag = artist_cell.find("a")
|
||||
artist_name_raw = a_tag.text.strip() if a_tag else artist_cell.text.strip()
|
||||
artist_name_clean = remove_emojis(artist_name_raw.replace('"', '')).strip()
|
||||
|
||||
if artist_name_clean in EXCLUDE_ARTISTS:
|
||||
continue
|
||||
|
||||
artist_url = a_tag['href'] if a_tag and a_tag.has_attr('href') else ""
|
||||
credits = cols[1].get_text(strip=True)
|
||||
updated = cols[2].get_text(strip=True)
|
||||
links_work = cols[3].get_text(strip=True)
|
||||
|
||||
cleaned_row = [
|
||||
artist_name_clean,
|
||||
remove_emojis(artist_url.replace('"', '')),
|
||||
remove_emojis(credits.replace('"', '')),
|
||||
remove_emojis(updated.replace('"', '')),
|
||||
remove_emojis(links_work.replace('"', ''))
|
||||
]
|
||||
|
||||
if all(cell for cell in cleaned_row):
|
||||
data.append(cleaned_row)
|
||||
|
||||
data.sort(key=lambda row: row[0].lower())
|
||||
|
||||
print(f"[*] Writing {len(data)} rows to CSV...")
|
||||
with open(CSV_FILE, "w", newline="", encoding="utf-8") as f:
|
||||
writer = csv.writer(f)
|
||||
writer.writerow(["artist name", "URL", "credits", "updated", "links work"])
|
||||
writer.writerows(data)
|
||||
|
||||
print("[✓] Done! CSV and XLSX updated.")
|
||||
|
||||
resp = requests.get(f"https://web.archive.org/save/{url}", headers=headers, timeout=30)
|
||||
if resp.status_code == 200:
|
||||
print(f"✅ Archived {url}")
|
||||
else:
|
||||
print(f"⚠️ Failed to archive {url}, status code {resp.status_code}")
|
||||
except Exception as e:
|
||||
print(f"[!] Error: {e}")
|
||||
print(f"⚠️ Exception archiving {url}: {e}")
|
||||
|
||||
def archive_all_urls():
|
||||
for url in ARCHIVE_URLS:
|
||||
delay = 10 + random.uniform(-3, 3)
|
||||
time.sleep(delay)
|
||||
archive_url(url)
|
||||
|
||||
def read_csv_to_dict(filename):
|
||||
"""Read CSV into dict with artist_name as key, storing relevant fields."""
|
||||
d = {}
|
||||
with open(filename, newline='', encoding='utf-8') as f:
|
||||
reader = csv.DictReader(f)
|
||||
for row in reader:
|
||||
d[row["Artist Name"]] = row
|
||||
return d
|
||||
|
||||
def detect_changes(old_data, new_data):
|
||||
"""
|
||||
Compare old and new data dictionaries.
|
||||
Returns a list of strings describing changes.
|
||||
"""
|
||||
changes = []
|
||||
|
||||
old_keys = set(old_data.keys())
|
||||
new_keys = set(new_data.keys())
|
||||
|
||||
removed = old_keys - new_keys
|
||||
added = new_keys - old_keys
|
||||
common = old_keys & new_keys
|
||||
|
||||
for artist in removed:
|
||||
changes.append(f"❌ Removed: **{artist}**")
|
||||
|
||||
for artist in added:
|
||||
changes.append(f"➕ Added: **{artist}**")
|
||||
|
||||
for artist in common:
|
||||
old_row = old_data[artist]
|
||||
new_row = new_data[artist]
|
||||
# Check if URL changed
|
||||
if old_row["URL"] != new_row["URL"]:
|
||||
changes.append(f"🔗 Link changed for **{artist}**")
|
||||
# Check other fields if needed (Credit, Updated, etc.)
|
||||
if old_row["Credit"] != new_row["Credit"]:
|
||||
changes.append(f"✏️ Credit changed for **{artist}**")
|
||||
if old_row["Links Work"] != new_row["Links Work"]:
|
||||
changes.append(f"🔄 Links Work status changed for **{artist}**")
|
||||
if old_row["Updated"] != new_row["Updated"]:
|
||||
changes.append(f"🕒 Updated date changed for **{artist}**")
|
||||
if old_row["Best"] != new_row["Best"]:
|
||||
changes.append(f"⭐ Best flag changed for **{artist}**")
|
||||
|
||||
return changes
|
||||
|
||||
def send_discord_message(content):
|
||||
if not DISCORD_WEBHOOK_URL:
|
||||
print("⚠️ Discord webhook URL not set in env")
|
||||
return
|
||||
|
||||
headers = {"Content-Type": "application/json"}
|
||||
data = {"content": content}
|
||||
|
||||
try:
|
||||
resp = requests.post(DISCORD_WEBHOOK_URL, headers=headers, data=json.dumps(data), timeout=10)
|
||||
if resp.status_code in (200, 204):
|
||||
print("✅ Discord notification sent")
|
||||
else:
|
||||
print(f"⚠️ Failed to send Discord notification, status code {resp.status_code}")
|
||||
except Exception as e:
|
||||
print(f"⚠️ Exception sending Discord notification: {e}")
|
||||
|
||||
def update_loop():
|
||||
last_csv_hash = None
|
||||
last_csv_data = {}
|
||||
|
||||
def background_updater():
|
||||
while True:
|
||||
fetch_and_process()
|
||||
time.sleep(600)
|
||||
try:
|
||||
download_zip_and_extract_html()
|
||||
download_xlsx()
|
||||
generate_csv()
|
||||
|
||||
# Routes
|
||||
@app.route("/")
|
||||
@app.route("/index")
|
||||
@app.route("/index.html")
|
||||
def index():
|
||||
return render_template("index.html")
|
||||
current_hash = hash_file(CSV_FILENAME)
|
||||
current_data = read_csv_to_dict(CSV_FILENAME)
|
||||
|
||||
@app.route('/favicon.png')
|
||||
def serve_favicon():
|
||||
return send_from_directory(app.template_folder, 'favicon.png')
|
||||
if last_csv_hash is None:
|
||||
print("ℹ️ Initial CSV hash stored.")
|
||||
elif current_hash != last_csv_hash:
|
||||
print("🔔 CSV has changed! Archiving URLs...")
|
||||
|
||||
@app.route("/artists.csv")
|
||||
def serve_csv():
|
||||
if os.path.exists(CSV_FILE):
|
||||
return send_file(CSV_FILE, mimetype="text/csv", as_attachment=False)
|
||||
return "CSV not ready yet.", 503
|
||||
changes = detect_changes(last_csv_data, current_data)
|
||||
if changes:
|
||||
message = "**CSV Update Detected:**\n" + "\n".join(changes)
|
||||
send_discord_message(message)
|
||||
else:
|
||||
print("ℹ️ No detectable content changes found.")
|
||||
|
||||
@app.route("/artists.xlsx")
|
||||
def serve_xlsx():
|
||||
if os.path.exists(XLSX_FILE):
|
||||
return send_file(XLSX_FILE, mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", as_attachment=False)
|
||||
return "XLSX not ready yet.", 503
|
||||
archive_all_urls()
|
||||
else:
|
||||
print("ℹ️ CSV unchanged. No archiving needed.")
|
||||
|
||||
last_csv_hash = current_hash
|
||||
last_csv_data = current_data
|
||||
|
||||
except Exception as e:
|
||||
print(f"⚠️ Error updating files: {e}")
|
||||
|
||||
time.sleep(600) # 10 minutes
|
||||
|
||||
@app.route("/artists.html")
|
||||
def serve_artists_html():
|
||||
if os.path.exists(HTML_FILE):
|
||||
return send_file(HTML_FILE, mimetype="text/html")
|
||||
return "HTML file not found.", 404
|
||||
return send_file(HTML_FILENAME, mimetype="text/html")
|
||||
|
||||
@app.route("/<path:path>")
|
||||
def catch_all(path):
|
||||
if os.path.exists(CSV_FILE):
|
||||
return send_file(CSV_FILE, mimetype="text/csv", as_attachment=False)
|
||||
return "CSV not ready yet.", 503
|
||||
@app.route("/artists.csv")
|
||||
def serve_artists_csv():
|
||||
return send_file(CSV_FILENAME, mimetype="text/csv")
|
||||
|
||||
@app.route('/_next/<path:filename>')
|
||||
def serve_next(filename):
|
||||
return send_from_directory(os.path.join(app.template_folder, '_next'), filename)
|
||||
@app.route("/artists.xlsx")
|
||||
def serve_artists_xlsx():
|
||||
return send_file(XLSX_FILENAME, mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
|
||||
|
||||
@app.route("/")
|
||||
@app.route("/index.html/")
|
||||
def serve_index():
|
||||
# Simple index page linking to your files
|
||||
return """
|
||||
<html>
|
||||
<head><title>Artists Data</title></head>
|
||||
<body>
|
||||
<h1>Artists Data</h1>
|
||||
<ul>
|
||||
<li><a href="/artists.html">Artists.html</a></li>
|
||||
<li><a href="/artists.csv">artists.csv</a></li>
|
||||
<li><a href="/artists.xlsx">artists.xlsx</a></li>
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
if __name__ == "__main__":
|
||||
thread = threading.Thread(target=background_updater, daemon=True)
|
||||
thread.start()
|
||||
threading.Thread(target=update_loop, daemon=True).start()
|
||||
try:
|
||||
download_zip_and_extract_html()
|
||||
download_xlsx()
|
||||
generate_csv()
|
||||
except Exception as e:
|
||||
print(f"⚠️ Initial update failed: {e}")
|
||||
|
||||
app.run(host="0.0.0.0", port=5000)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue