feat: add parser functions with tests
This commit adds pure parsing helpers to scraper.py, unit tests for them, and a dev dependency group for pytest.
@@ -6,3 +6,7 @@ dependencies = [
     "playwright>=1.40.0",
 ]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0.0",
+]
|||||||
scraper.py · 49 lines · new file
@@ -0,0 +1,49 @@
|
|||||||
|
import re
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from playwright.sync_api import sync_playwright, Page
|
||||||
|
|
||||||
|
# Input list: one profile URL per line (lines may carry a "number<TAB>" prefix
# that read_profiles() strips — see that function).
PROFILES_FILE = Path("profiles.txt")

# Scrape results are emitted twice: machine-readable CSV and human-readable Markdown.
OUTPUT_CSV = Path("output.csv")
OUTPUT_MD = Path("output.md")

# Directory handed to the browser for persistent state
# (presumably keeps login sessions between runs — TODO confirm at call site).
BROWSER_PROFILE = Path("browser_profile")

# Cap on posts collected per profile (usage not visible in this chunk).
POSTS_PER_PROFILE = 5
|
||||||
|
|
||||||
|
|
||||||
|
def extract_hashtags(text: str) -> list[str]:
    """Return every ``#word`` hashtag in *text*, deduplicated.

    First-occurrence order is preserved; a tag that repeats is kept
    only where it first appeared.
    """
    # dict.fromkeys drops duplicates while keeping insertion order,
    # which is exactly the "seen set + result list" pattern in one step.
    return list(dict.fromkeys(re.findall(r"#\w+", text)))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_mentions(text: str) -> list[str]:
    """Return every ``@name`` mention in *text*, deduplicated.

    First-occurrence order is preserved, mirroring extract_hashtags.
    """
    # Ordered dedupe in one expression: dict keys are unique and ordered.
    return list(dict.fromkeys(re.findall(r"@\w+", text)))
|
||||||
|
|
||||||
|
|
||||||
|
def profile_slug_from_url(url: str) -> str:
    """Return the last path component of *url* (the profile handle)."""
    # Drop trailing slashes first so ".../name/" and ".../name" agree;
    # rpartition yields the text after the final "/" (or the whole string
    # when no "/" remains, matching split("/")[-1]).
    _, _, slug = url.rstrip("/").rpartition("/")
    return slug
|
||||||
|
|
||||||
|
|
||||||
|
def read_profiles() -> list[str]:
    """Read profile URLs from PROFILES_FILE.

    Each non-blank line may be prefixed with a number and a tab; only the
    final tab-separated field is examined, and it is kept only when it
    starts with "http". Returns the kept URLs in file order.
    """
    urls: list[str] = []
    for raw_line in PROFILES_FILE.read_text().splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        # Keep only the last tab-separated field (drops a "N<TAB>" prefix).
        candidate = stripped.split("\t")[-1].strip()
        if candidate.startswith("http"):
            urls.append(candidate)
    return urls
|
||||||
tests/__init__.py · 0 lines · new file
tests/test_parsers.py · 24 lines · new file
@@ -0,0 +1,24 @@
|
|||||||
|
import sys, os
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||||
|
from scraper import extract_hashtags, extract_mentions, profile_slug_from_url
|
||||||
|
|
||||||
|
def test_extract_hashtags_basic():
    # Two distinct tags come back in appearance order.
    text = "Hello #world #foo"
    assert extract_hashtags(text) == ["#world", "#foo"]
|
||||||
|
|
||||||
|
def test_extract_hashtags_empty():
    # Text with no "#" at all yields an empty list.
    result = extract_hashtags("No tags here")
    assert result == []
|
||||||
|
|
||||||
|
def test_extract_hashtags_deduplicates():
    # Repeated tags appear once, at their first position.
    tags = extract_hashtags("#foo #foo #bar")
    assert tags == ["#foo", "#bar"]
|
||||||
|
|
||||||
|
def test_extract_mentions_basic():
    # Two mentions, returned in appearance order.
    result = extract_mentions("Hey @alice and @bob")
    assert result == ["@alice", "@bob"]
|
||||||
|
|
||||||
|
def test_extract_mentions_empty():
    # No "@" in the text -> empty list.
    assert extract_mentions("No mentions") == []
|
||||||
|
|
||||||
|
def test_profile_slug_from_url():
    # URL ending in "/" still yields the bare handle.
    url = "https://www.instagram.com/licmuunisul/"
    assert profile_slug_from_url(url) == "licmuunisul"
|
||||||
|
|
||||||
|
def test_profile_slug_trailing_slash():
    # NOTE(review): despite the name, this covers the *no*-trailing-slash
    # case (the sibling test covers the trailing-slash one) — consider renaming.
    url = "https://www.instagram.com/licmuunisul"
    assert profile_slug_from_url(url) == "licmuunisul"
|
||||||
Reference in New Issue
Block a user