rewrite

2025-07-20 18:23:49 +03:00 · 2025-07-20 18:23:49 +03:00 · 4b708ba090
commit 4b708ba090
parent fb499a72c4
13 changed files with 272 additions and 141 deletions
--- a/main.py
+++ b/main.py
@@ -1,168 +1,288 @@
-import os
-import threading
-import time
 import requests
 import zipfile
+import threading
+import time
+import random
+import hashlib
+from bs4 import BeautifulSoup
 import csv
 import re
-from bs4 import BeautifulSoup
-from flask import Flask, send_file, render_template, send_from_directory
-from flask_cors import CORS
+from flask import Flask, send_file
+import os
+import json

-app = Flask(__name__, template_folder="templates")
-CORS(app)  # ✅ ENABLE CORS FOR ALL ROUTES
+app = Flask(__name__)

-# Constants
 ZIP_URL = "https://docs.google.com/spreadsheets/d/1S6WwM05O277npQbaiNk-jZlXK3TdooSyWtqaWUvAI78/export?format=zip"
 XLSX_URL = "https://docs.google.com/spreadsheets/d/1S6WwM05O277npQbaiNk-jZlXK3TdooSyWtqaWUvAI78/export?format=xlsx"
-ZIP_FILE = "Trackerhub Reup.zip"
-XLSX_FILE = "artists.xlsx"
-EXTRACT_FOLDER = "sheet"
-HTML_FILE = os.path.join(EXTRACT_FOLDER, "Artists.html")
-CSV_FILE = "artists.csv"

-# Artist names to exclude (no emojis, trimmed)
-EXCLUDE_ARTISTS = {
-    "🤖 AI Models",
-    "🤖 Lawson",
-    "Comps & Edits",
+# Local file names for the downloaded exports and the files derived from them.
+ZIP_FILENAME = "Trackerhub.zip"
+HTML_FILENAME = "Artists.html"
+CSV_FILENAME = "artists.csv"
+XLSX_FILENAME = "artists.xlsx"
+
+exclude_names = {
+    "AI Models",
+    "Lawson",
+    "BPM Tracker",
    "Worst Comps & Edits",
-    "Yedits",
    "Allegations",
    "Rap Disses Timeline",
-    "Underground Artists"
+    "Underground Artists",
 }

-def remove_emojis(text):
-    emoji_pattern = re.compile(
-        r'\s*['
-        '\U0001F600-\U0001F64F'
-        '\U0001F300-\U0001F5FF'
-        '\U0001F680-\U0001F6FF'
-        '\U0001F1E0-\U0001F1FF'
-        '\u2702-\u27B0'
-        '\u24C2-\U0001F251'
-        ']\s*',
-        flags=re.UNICODE
-    )
-    cleaned_text = emoji_pattern.sub('', text)
-    return cleaned_text.strip()
+# Browser-like User-Agent header sent with the web.archive.org save requests.
+USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"

-def fetch_and_process():
+# Public base URL of this service. web.archive.org cannot reach localhost, so
+# set the BASE_URL environment variable to the deployed domain; the localhost
+# default only keeps local development working. A trailing slash is dropped so
+# the paths below never contain "//".
+BASE_URL = os.getenv("BASE_URL", "http://localhost:5000").rstrip("/")
+
+# Pages submitted for archiving whenever the generated CSV changes.
+ARCHIVE_URLS = [
+    f"{BASE_URL}/",
+    f"{BASE_URL}/index.html/",
+    f"{BASE_URL}/artists.html",
+    f"{BASE_URL}/artists.csv",
+    f"{BASE_URL}/artists.xlsx",
+]
+
+# Discord webhook that receives change notifications; when unset,
+# send_discord_message only prints a warning.
+DISCORD_WEBHOOK_URL = os.getenv("DISCORD_WEBHOOK_URL")
+
+def clean_artist_name(text):
+    """Remove the ⭐, 🤖 and 🎭 markers (and U+FE0F) from an artist name.
+
+    Returns the remaining text with leading and trailing whitespace stripped.
+    """
+    marker_pattern = re.compile(r'[⭐🤖🎭\u2B50\uFE0F]')
+    without_markers = marker_pattern.sub('', text)
+    return without_markers.strip()
+
+def force_star_flag(starred=True):
+    """Translate a truthy/falsy *starred* value into the "Yes"/"No" column text."""
+    if starred:
+        return "Yes"
+    return "No"
+
+def download_zip_and_extract_html():
+    """Download the spreadsheet ZIP export and extract the artists HTML page.
+
+    Saves the archive to ZIP_FILENAME, then writes the archive member named
+    HTML_FILENAME to a local file of the same name.
+
+    Raises:
+        requests.RequestException: the download fails, times out, or returns
+            an HTTP error status.
+        KeyError: the archive has no member named HTML_FILENAME.
+    """
+    print("🔄 Downloading ZIP...")
+    # Without a timeout a stalled connection would block the caller forever.
+    r = requests.get(ZIP_URL, timeout=30)
+    r.raise_for_status()
+    with open(ZIP_FILENAME, "wb") as f:
+        f.write(r.content)
+    print(f"✅ Saved ZIP as {ZIP_FILENAME}")
+
+    # Read the member fully first so the archive is closed before writing.
+    with zipfile.ZipFile(ZIP_FILENAME, "r") as z:
+        html_content = z.read(HTML_FILENAME)
+    with open(HTML_FILENAME, "wb") as f:
+        f.write(html_content)
+    print(f"✅ Extracted {HTML_FILENAME}")
+
+def download_xlsx():
+    """Download the spreadsheet's XLSX export and save it as XLSX_FILENAME.
+
+    Raises:
+        requests.RequestException: the download fails, times out, or returns
+            an HTTP error status.
+    """
+    print("🔄 Downloading XLSX...")
+    # Without a timeout a stalled connection would block the caller forever.
+    r = requests.get(XLSX_URL, timeout=30)
+    r.raise_for_status()
+    with open(XLSX_FILENAME, "wb") as f:
+        f.write(r.content)
+    print(f"✅ Saved XLSX as {XLSX_FILENAME}")
+
+def generate_csv():
+    """Build CSV_FILENAME from the artist table in HTML_FILENAME.
+
+    Reads the rows of ``table.waffle`` (skipping the first three) and writes
+    one CSV row per artist that has a link, is not listed in ``exclude_names``
+    and is not flagged with 🚩. Rows get Best="Yes" until the first linked row
+    whose name contains "AI Models"; from that row on the value is "No".
+    """
+    print("📝 Generating CSV...")
+    with open(HTML_FILENAME, "r", encoding="utf-8") as f:
+        soup = BeautifulSoup(f, "html.parser")
+
+    rows = soup.select("table.waffle tbody tr")[3:]  # skip headers and Discord
+
+    data = []
+    starring = True
+
+    for row in rows:
+        cells = row.find_all("td")
+        if len(cells) < 4:
+            continue
+
+        link_tag = cells[0].find("a")
+        artist_name_raw = link_tag.get_text(strip=True) if link_tag else cells[0].get_text(strip=True)
+        # An <a> without an href must not abort the whole run with KeyError;
+        # treat it like a row that has no link at all.
+        artist_url = link_tag.get("href", "") if link_tag else ""
+        if not artist_url:
+            continue
+
+        # The "AI Models" row marks the end of the starred section.
+        if "AI Models" in artist_name_raw:
+            starring = False
+
+        artist_name_clean = clean_artist_name(artist_name_raw)
+        if artist_name_clean in exclude_names:
+            continue
+
+        if "🚩" in artist_name_raw:
+            continue
+
+        best = force_star_flag(starring)
+        credit = cells[1].get_text(strip=True)
+        updated = cells[2].get_text(strip=True)
+        links_work = cells[3].get_text(strip=True)
+
+        data.append([artist_name_clean, artist_url, credit, links_work, updated, best])
+
+    with open(CSV_FILENAME, "w", newline='', encoding="utf-8") as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(["Artist Name", "URL", "Credit", "Links Work", "Updated", "Best"])
+        writer.writerows(data)
+
+    print(f"✅ CSV saved as {CSV_FILENAME}")
+
+def hash_file(filename):
+    """Return the SHA-256 hex digest of the contents of *filename*."""
+    with open(filename, "rb") as stream:
+        digest = hashlib.sha256(stream.read())
+    return digest.hexdigest()
+
+def archive_url(url):
+    print(f"🌐 Archiving {url} ...")
+    headers = {"User-Agent": USER_AGENT}
    try:
-        print("[*] Downloading ZIP...")
-        r = requests.get(ZIP_URL)
-        with open(ZIP_FILE, "wb") as f:
-            f.write(r.content)
-
-        print("[*] Extracting ZIP...")
-        with zipfile.ZipFile(ZIP_FILE, 'r') as zip_ref:
-            zip_ref.extractall(EXTRACT_FOLDER)
-
-        print("[*] Downloading XLSX...")
-        r = requests.get(XLSX_URL)
-        with open(XLSX_FILE, "wb") as f:
-            f.write(r.content)
-
-        print("[*] Parsing HTML...")
-        with open(HTML_FILE, "r", encoding="utf-8") as f:
-            soup = BeautifulSoup(f, "html.parser")
-
-        table = soup.find("table", class_="waffle")
-        if not table:
-            print("[!] Table not found.")
-            return
-
-        rows = table.find_all("tr")[4:]
-
-        data = []
-        for row in rows:
-            cols = row.find_all("td")
-            if len(cols) < 4:
-                continue
-
-            artist_cell = cols[0]
-            a_tag = artist_cell.find("a")
-            artist_name_raw = a_tag.text.strip() if a_tag else artist_cell.text.strip()
-            artist_name_clean = remove_emojis(artist_name_raw.replace('"', '')).strip()
-
-            if artist_name_clean in EXCLUDE_ARTISTS:
-                continue
-
-            artist_url = a_tag['href'] if a_tag and a_tag.has_attr('href') else ""
-            credits = cols[1].get_text(strip=True)
-            updated = cols[2].get_text(strip=True)
-            links_work = cols[3].get_text(strip=True)
-
-            cleaned_row = [
-                artist_name_clean,
-                remove_emojis(artist_url.replace('"', '')),
-                remove_emojis(credits.replace('"', '')),
-                remove_emojis(updated.replace('"', '')),
-                remove_emojis(links_work.replace('"', ''))
-            ]
-
-            if all(cell for cell in cleaned_row):
-                data.append(cleaned_row)
-
-        data.sort(key=lambda row: row[0].lower())
-
-        print(f"[*] Writing {len(data)} rows to CSV...")
-        with open(CSV_FILE, "w", newline="", encoding="utf-8") as f:
-            writer = csv.writer(f)
-            writer.writerow(["artist name", "URL", "credits", "updated", "links work"])
-            writer.writerows(data)
-
-        print("[✓] Done! CSV and XLSX updated.")
-
+        resp = requests.get(f"https://web.archive.org/save/{url}", headers=headers, timeout=30)
+        if resp.status_code == 200:
+            print(f"✅ Archived {url}")
+        else:
+            print(f"⚠️ Failed to archive {url}, status code {resp.status_code}")
    except Exception as e:
-        print(f"[!] Error: {e}")
+        print(f"⚠️ Exception archiving {url}: {e}")
+
+def archive_all_urls():
+    """Archive every entry of ARCHIVE_URLS, pausing 7-13 seconds before each."""
+    for target in ARCHIVE_URLS:
+        # Randomised pause around 10 s before every request, the first included.
+        time.sleep(10 + random.uniform(-3, 3))
+        archive_url(target)
+
+def read_csv_to_dict(filename):
+    """Load a CSV file into a dict keyed by each row's "Artist Name" value.
+
+    Every value is the full row mapping produced by csv.DictReader; when a
+    name occurs more than once the last row wins.
+    """
+    with open(filename, newline='', encoding='utf-8') as handle:
+        return {record["Artist Name"]: record for record in csv.DictReader(handle)}
+
+def detect_changes(old_data, new_data):
+    """Describe the differences between two artist tables.
+
+    Args:
+        old_data: mapping of artist name to its previous CSV row.
+        new_data: mapping of artist name to its current CSV row.
+
+    Returns:
+        A list of message strings: removals first, then additions, then
+        per-field changes for artists present in both. Artists are reported
+        in sorted name order so the same input always yields the same list.
+    """
+    # (column, message template) pairs, checked in this order for every artist.
+    field_messages = (
+        ("URL", "🔗 Link changed for **{}**"),
+        ("Credit", "✏️ Credit changed for **{}**"),
+        ("Links Work", "🔄 Links Work status changed for **{}**"),
+        ("Updated", "🕒 Updated date changed for **{}**"),
+        ("Best", "⭐ Best flag changed for **{}**"),
+    )
+
+    old_keys = set(old_data)
+    new_keys = set(new_data)
+
+    # Set iteration order is not stable between runs, so sort before reporting.
+    changes = [f"❌ Removed: **{artist}**" for artist in sorted(old_keys - new_keys)]
+    changes += [f"➕ Added: **{artist}**" for artist in sorted(new_keys - old_keys)]
+
+    for artist in sorted(old_keys & new_keys):
+        old_row = old_data[artist]
+        new_row = new_data[artist]
+        for column, template in field_messages:
+            # .get() lets a row that lacks a column compare as changed
+            # instead of raising KeyError.
+            if old_row.get(column) != new_row.get(column):
+                changes.append(template.format(artist))
+
+    return changes
+
+def _split_discord_content(content, limit=2000):
+    """Split *content* into pieces of at most *limit* characters, breaking at newlines where possible."""
+    chunks = []
+    current = ""
+    for line in content.split("\n"):
+        # A single line longer than the limit has to be cut mid-line.
+        while len(line) > limit:
+            if current:
+                chunks.append(current)
+                current = ""
+            chunks.append(line[:limit])
+            line = line[limit:]
+        candidate = f"{current}\n{line}" if current else line
+        if len(candidate) > limit:
+            chunks.append(current)
+            current = line
+        else:
+            current = candidate
+    if current:
+        chunks.append(current)
+    return chunks
+
+
+def send_discord_message(content):
+    """Post *content* to the Discord webhook configured in DISCORD_WEBHOOK_URL.
+
+    Discord rejects messages whose content exceeds 2000 characters, so long
+    text is sent as several consecutive messages. Sending is best-effort:
+    failures are printed, never raised, and stop the remaining chunks.
+    Nothing is sent when the webhook URL is unset or *content* is empty.
+    """
+    if not DISCORD_WEBHOOK_URL:
+        print("⚠️ Discord webhook URL not set in env")
+        return
+
+    headers = {"Content-Type": "application/json"}
+
+    for chunk in _split_discord_content(content):
+        data = {"content": chunk}
+        try:
+            resp = requests.post(DISCORD_WEBHOOK_URL, headers=headers, data=json.dumps(data), timeout=10)
+        except requests.RequestException as e:
+            print(f"⚠️ Exception sending Discord notification: {e}")
+            return
+
+        if resp.status_code in (200, 204):
+            print("✅ Discord notification sent")
+        else:
+            print(f"⚠️ Failed to send Discord notification, status code {resp.status_code}")
+            return
+
+def update_loop():
+    last_csv_hash = None
+    last_csv_data = {}

-def background_updater():
    while True:
-        fetch_and_process()
-        time.sleep(600)
+        try:
+            download_zip_and_extract_html()
+            download_xlsx()
+            generate_csv()

-# Routes
-@app.route("/")
-@app.route("/index")
-@app.route("/index.html")
-def index():
-    return render_template("index.html")
+            current_hash = hash_file(CSV_FILENAME)
+            current_data = read_csv_to_dict(CSV_FILENAME)

-@app.route('/favicon.png')
-def serve_favicon():
-    return send_from_directory(app.template_folder, 'favicon.png')
+            if last_csv_hash is None:
+                print("ℹ️ Initial CSV hash stored.")
+            elif current_hash != last_csv_hash:
+                print("🔔 CSV has changed! Archiving URLs...")

-@app.route("/artists.csv")
-def serve_csv():
-    if os.path.exists(CSV_FILE):
-        return send_file(CSV_FILE, mimetype="text/csv", as_attachment=False)
-    return "CSV not ready yet.", 503
+                changes = detect_changes(last_csv_data, current_data)
+                if changes:
+                    message = "**CSV Update Detected:**\n" + "\n".join(changes)
+                    send_discord_message(message)
+                else:
+                    print("ℹ️ No detectable content changes found.")

-@app.route("/artists.xlsx")
-def serve_xlsx():
-    if os.path.exists(XLSX_FILE):
-        return send_file(XLSX_FILE, mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", as_attachment=False)
-    return "XLSX not ready yet.", 503
+                archive_all_urls()
+            else:
+                print("ℹ️ CSV unchanged. No archiving needed.")
+
+            last_csv_hash = current_hash
+            last_csv_data = current_data
+
+        except Exception as e:
+            print(f"⚠️ Error updating files: {e}")
+
+        time.sleep(600)  # 10 minutes

 @app.route("/artists.html")
 def serve_artists_html():
-    if os.path.exists(HTML_FILE):
-        return send_file(HTML_FILE, mimetype="text/html")
-    return "HTML file not found.", 404
+    return send_file(HTML_FILENAME, mimetype="text/html")

-@app.route("/<path:path>")
-def catch_all(path):
-    if os.path.exists(CSV_FILE):
-        return send_file(CSV_FILE, mimetype="text/csv", as_attachment=False)
-    return "CSV not ready yet.", 503
+@app.route("/artists.csv")
+def serve_artists_csv():
+    """Serve the generated CSV, or a 503 while it has not been produced yet."""
+    # send_file raises on a missing path, which would surface as a 500 before
+    # the first successful update has written the file.
+    if not os.path.exists(CSV_FILENAME):
+        return "CSV not ready yet.", 503
+    return send_file(CSV_FILENAME, mimetype="text/csv")

-@app.route('/_next/<path:filename>')
-def serve_next(filename):
-    return send_from_directory(os.path.join(app.template_folder, '_next'), filename)
+@app.route("/artists.xlsx")
+def serve_artists_xlsx():
+    """Serve the downloaded XLSX, or a 503 while it has not been fetched yet."""
+    # send_file raises on a missing path, which would surface as a 500 before
+    # the first successful download has written the file.
+    if not os.path.exists(XLSX_FILENAME):
+        return "XLSX not ready yet.", 503
+    return send_file(XLSX_FILENAME, mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
+
+@app.route("/")
+@app.route("/index.html/")
+def serve_index():
+    """Return a static HTML page linking to the three served artist files."""
+    # Simple index page linking to your files
+    return """
+    <html>
+        <head><title>Artists Data</title></head>
+        <body>
+            <h1>Artists Data</h1>
+            <ul>
+                <li><a href="/artists.html">Artists.html</a></li>
+                <li><a href="/artists.csv">artists.csv</a></li>
+                <li><a href="/artists.xlsx">artists.xlsx</a></li>
+            </ul>
+        </body>
+    </html>
+    """

 if __name__ == "__main__":
-    thread = threading.Thread(target=background_updater, daemon=True)
-    thread.start()
+    threading.Thread(target=update_loop, daemon=True).start()
+    try:
+        download_zip_and_extract_html()
+        download_xlsx()
+        generate_csv()
+    except Exception as e:
+        print(f"⚠️ Initial update failed: {e}")
+
    app.run(host="0.0.0.0", port=5000)