feat: add parser functions with tests
This commit is contained in:
49
scraper.py
Normal file
49
scraper.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import re
|
||||
import csv
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from playwright.sync_api import sync_playwright, Page
|
||||
|
||||
PROFILES_FILE = Path("profiles.txt")
|
||||
OUTPUT_CSV = Path("output.csv")
|
||||
OUTPUT_MD = Path("output.md")
|
||||
BROWSER_PROFILE = Path("browser_profile")
|
||||
POSTS_PER_PROFILE = 5
|
||||
|
||||
|
||||
def extract_hashtags(text: str) -> list[str]:
    """Return the hashtags found in *text*, deduplicated, in first-seen order.

    A hashtag is a ``#`` followed by one or more word characters.
    """
    # dict.fromkeys preserves insertion order, so this dedupes while
    # keeping the first occurrence of each tag.
    return list(dict.fromkeys(re.findall(r"#\w+", text)))
|
||||
|
||||
|
||||
def extract_mentions(text: str) -> list[str]:
    """Return the @-mentions found in *text*, deduplicated, in first-seen order.

    A mention is an ``@`` followed by one or more word characters.
    """
    # Order-preserving dedup: dict keys keep insertion order.
    return list(dict.fromkeys(re.findall(r"@\w+", text)))
|
||||
|
||||
|
||||
def profile_slug_from_url(url: str) -> str:
    """Return the final path segment of *url* (the profile slug)."""
    # Strip any trailing slashes first so the last segment is non-empty,
    # then take everything after the final "/". If no "/" is present,
    # rfind returns -1 and the whole string is returned.
    trimmed = url.rstrip("/")
    return trimmed[trimmed.rfind("/") + 1:]
|
||||
|
||||
|
||||
def read_profiles() -> list[str]:
    """Read profile URLs from ``PROFILES_FILE``, one per line.

    Blank lines are skipped. Lines may be prefixed with a number and a
    tab; only the final tab-separated field is considered. Entries are
    kept only when they start with ``"http"``.
    """
    urls: list[str] = []
    for raw_line in PROFILES_FILE.read_text().splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        # Lines may be prefixed with a number and tab — take the last field.
        candidate = stripped.rsplit("\t", 1)[-1].strip()
        if candidate.startswith("http"):
            urls.append(candidate)
    return urls
|
||||
Reference in New Issue
Block a user