Private
Public Access
1
0

feat: add parser functions with tests

This commit is contained in:
belisards
2026-03-29 16:52:40 -03:00
parent c5a01190c1
commit cdd71deb17
4 changed files with 77 additions and 0 deletions

49
scraper.py Normal file
View File

@@ -0,0 +1,49 @@
import re
import csv
import sys
from pathlib import Path
from playwright.sync_api import sync_playwright, Page
# Text file listing the profile URLs to process; read by read_profiles().
PROFILES_FILE = Path("profiles.txt")
# NOTE(review): presumably the CSV and Markdown result files; not used in the
# code visible here -- confirm against the writer functions.
OUTPUT_CSV = Path("output.csv")
OUTPUT_MD = Path("output.md")
# NOTE(review): presumably a persistent browser user-data directory for
# Playwright; not used in the code visible here -- confirm.
BROWSER_PROFILE = Path("browser_profile")
# NOTE(review): presumably the number of posts collected per profile -- confirm.
POSTS_PER_PROFILE = 5
def extract_hashtags(text: str) -> list[str]:
    """Return the distinct hashtags found in *text*, in first-seen order.

    A hashtag is a ``#`` followed by one or more regex word characters.
    Comparison is case-sensitive, so ``#Tag`` and ``#tag`` are both kept.

    Args:
        text: Free-form text to scan.

    Returns:
        Hashtags including the leading ``#``, without duplicates.
    """
    # dict.fromkeys de-duplicates while preserving first-occurrence order.
    return list(dict.fromkeys(re.findall(r"#\w+", text)))
def extract_mentions(text: str) -> list[str]:
    """Return the distinct mentions found in *text*, in first-seen order.

    A mention is an ``@`` followed by one or more regex word characters. An
    ``@`` that directly follows a word character is ignored, so the domain
    part of an e-mail address such as ``bob@example.com`` is not reported as
    a mention. Comparison is case-sensitive.

    Args:
        text: Free-form text to scan.

    Returns:
        Mentions including the leading ``@``, without duplicates.
    """
    # (?<!\w) rejects an "@" glued to a preceding word character (e-mail
    # addresses); dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(re.findall(r"(?<!\w)@\w+", text)))
def profile_slug_from_url(url: str) -> str:
    """Return the last path segment of *url* as the profile slug.

    Any fragment (``#...``) and query string (``?...``) are removed first,
    then trailing slashes are ignored, so ``https://host/name/?hl=en``
    yields ``name`` rather than ``?hl=en``.

    Args:
        url: Profile URL, with or without a trailing slash.

    Returns:
        The text after the last ``/`` of the cleaned URL; for a URL with no
        path this is the host part, and for an empty string it is ``""``.
    """
    # Cut the fragment before the query: a "?" inside a fragment is not a query.
    without_extras = url.split("#", 1)[0].split("?", 1)[0]
    return without_extras.rstrip("/").split("/")[-1]
def read_profiles(path: "Path | None" = None) -> list[str]:
    """Read profile URLs from a text file, one entry per line.

    Each line is stripped of surrounding whitespace. A line may be just a URL
    or several tab-separated fields (for example a number, a tab, then the
    URL); the last field is taken as the URL. Lines whose last field does not
    start with ``http`` -- including blank lines -- are skipped.

    Args:
        path: File to read. When omitted, the module-level ``PROFILES_FILE``
            is used; it is looked up at call time.

    Returns:
        The URLs in file order. Duplicates are kept.

    Raises:
        FileNotFoundError: If the file does not exist.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    source = PROFILES_FILE if path is None else path
    # Read as UTF-8 regardless of the platform locale. "utf-8-sig" also drops
    # a leading byte-order mark, which would otherwise stick to the first line
    # and make a bare first URL fail the "http" check below.
    text = source.read_text(encoding="utf-8-sig")
    urls = []
    for line in text.splitlines():
        # Lines may be prefixed with a number and a tab; the URL is the last field.
        url = line.strip().split("\t")[-1].strip()
        if url.startswith("http"):
            urls.append(url)
    return urls