This commit is contained in:
Eduard Prigoana 2025-08-22 04:42:18 +03:00
parent be789cb732
commit c23eb924c3
85 changed files with 7090 additions and 253 deletions

View file

@ -1,50 +1,70 @@
from bs4 import BeautifulSoup
# parser.py
import csv
import logging
from config import HTML_FILENAME, CSV_FILENAME, exclude_names
from bs4 import BeautifulSoup
from config import CSV_FILENAME, HTML_FILENAME, exclude_names
from utils import clean_artist_name, force_star_flag
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def generate_csv():
    """Parse the saved HTML table and write the artist rows to a CSV file.

    Reads ``HTML_FILENAME``, locates ``table.waffle tbody``, skips its first
    three rows and collects one record per remaining row that has at least
    four ``<td>`` cells, a non-empty name in the first cell and a link in
    that cell. Rows whose cleaned name is in ``exclude_names`` or whose raw
    name contains "🚩" are dropped. The records are written to
    ``CSV_FILENAME`` with every field quoted.

    Errors (missing HTML file, missing table body, failure to write the CSV)
    are logged and the function returns without raising.

    Returns:
        None.
    """
    logger.info(f"📝 Generating {CSV_FILENAME} from {HTML_FILENAME}...")
    try:
        with open(HTML_FILENAME, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")
    except FileNotFoundError:
        logger.error(f"{HTML_FILENAME} not found. Cannot generate CSV.")
        return

    table_body = soup.select_one("table.waffle tbody")
    if not table_body:
        logger.error("❌ Could not find the table body in HTML. Cannot generate CSV.")
        return

    rows = table_body.select("tr")
    data = []
    # True until a row whose name contains "AI Models" is seen; the value is
    # passed to force_star_flag() to fill the "Best" column.
    starring_section = True

    # The first three rows are skipped (presumably header rows -- confirm
    # against the actual sheet export).
    for row in rows[3:]:
        cells = row.find_all("td")
        if len(cells) < 4:
            continue

        # Always take the artist name from the column text.
        artist_name_raw = cells[0].get_text(strip=True)

        # Only use the <a> for the URL (if it exists).
        link_tag = cells[0].find("a")
        artist_url = link_tag.get("href") if link_tag else ""

        if not artist_name_raw or not artist_url:
            continue

        # NOTE(review): this marker check is only reached for rows that have
        # a link; if the "AI Models" row carries no <a>, the flag never
        # flips. Order kept as in the original -- confirm this is intended.
        if "AI Models" in artist_name_raw:
            starring_section = False

        artist_name_clean = clean_artist_name(artist_name_raw)
        if artist_name_clean in exclude_names or "🚩" in artist_name_raw:
            continue

        # Column order must match the header row written below:
        # cells[1] -> Credit, cells[3] -> Links Work, cells[2] -> Updated.
        data.append(
            [
                artist_name_clean,
                artist_url,
                cells[1].get_text(strip=True),
                cells[3].get_text(strip=True),
                cells[2].get_text(strip=True),
                force_star_flag(starring_section),
            ]
        )

    try:
        with open(CSV_FILENAME, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
            writer.writerow(
                ["Artist Name", "URL", "Credit", "Links Work", "Updated", "Best"]
            )
            writer.writerows(data)
        logger.info(f"✅ Generated {CSV_FILENAME} with {len(data)} rows.")
    except OSError as e:  # IOError is an alias of OSError in Python 3
        logger.error(f"❌ Failed to write CSV file {CSV_FILENAME}: {e}")