import re
import csv
import sys
from pathlib import Path

from playwright.sync_api import sync_playwright, Page

# Module-level configuration. PROFILES_FILE is the input read by
# read_profiles(); the remaining values are not referenced in this part of
# the file (presumably consumed further down -- confirm against the callers).
PROFILES_FILE = Path("profiles.txt")
OUTPUT_CSV = Path("output.csv")
OUTPUT_MD = Path("output.md")
BROWSER_PROFILE = Path("browser_profile")
POSTS_PER_PROFILE = 5


def _unique_matches(pattern: str, text: str) -> list[str]:
    """Return all matches of *pattern* in *text*, de-duplicated in first-seen order.

    *pattern* must not contain capture groups, so that ``re.findall`` yields
    the full matched strings.
    """
    # dicts preserve insertion order, so fromkeys() drops repeats while
    # keeping the position of each first occurrence.
    return list(dict.fromkeys(re.findall(pattern, text)))


def extract_hashtags(text: str) -> list[str]:
    """Return the distinct ``#tag`` tokens found in *text*.

    Tags keep their leading ``#`` and are returned in order of first
    appearance. Comparison is case-sensitive, so ``#Foo`` and ``#foo`` are
    treated as different tags.
    """
    return _unique_matches(r"#\w+", text)


def extract_mentions(text: str) -> list[str]:
    """Return the distinct ``@name`` tokens found in *text*.

    Mentions keep their leading ``@`` and are returned in order of first
    appearance. Comparison is case-sensitive.
    """
    return _unique_matches(r"@\w+", text)


def profile_slug_from_url(url: str) -> str:
    """Return the last path segment of *url*, ignoring trailing slashes.

    Any query string (``?...``) or fragment (``#...``) is dropped first, so
    ``https://example.com/someone/?hl=en`` yields ``someone`` rather than
    ``?hl=en``.
    """
    # Cut the fragment before the query: a '#' always comes after any '?'.
    without_suffix = url.split("#", 1)[0].split("?", 1)[0]
    return without_suffix.rstrip("/").split("/")[-1]


def read_profiles() -> list[str]:
    """Read the profile URLs listed in ``PROFILES_FILE``.

    Blank lines are skipped. On each remaining line only the last
    tab-separated field is considered, and it is kept only when it starts
    with ``http``.

    Returns:
        The accepted URLs, in file order (duplicates are not removed).

    Raises:
        OSError: if ``PROFILES_FILE`` cannot be read (e.g. it does not exist).
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    urls = []
    # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM; without
    # that, a BOM stays attached to the first line and a URL there fails the
    # startswith("http") check and is silently dropped.
    content = PROFILES_FILE.read_text(encoding="utf-8-sig")
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        # Lines may be prefixed with a number and tab; the URL is the last field.
        url = line.split("\t")[-1].strip()
        if url.startswith("http"):
            urls.append(url)
    return urls