From cdd71deb1780459fbe5013947767efb57c55eb7f Mon Sep 17 00:00:00 2001
From: belisards <adrianobf@gmail.com>
Date: Sun, 29 Mar 2026 16:52:40 -0300
Subject: [PATCH] feat: add parser functions with tests

---
 pyproject.toml        |  4 ++++
 scraper.py            | 49 +++++++++++++++++++++++++++++++++++++++++++
 tests/__init__.py     |  0
 tests/test_parsers.py | 24 +++++++++++++++++++++
 4 files changed, 77 insertions(+)
 create mode 100644 scraper.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_parsers.py

diff --git a/pyproject.toml b/pyproject.toml
index b4ca1c6..6f48b0c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,3 +6,7 @@ dependencies = [
     "playwright>=1.40.0",
 ]
 
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0.0",
+]
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..5fe3dff
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,49 @@
+import re
+import csv
+import sys
+from pathlib import Path
+from playwright.sync_api import sync_playwright, Page
+
+PROFILES_FILE = Path("profiles.txt")  # input list consumed by read_profiles()
+OUTPUT_CSV = Path("output.csv")  # NOTE(review): not referenced by the code in this patch
+OUTPUT_MD = Path("output.md")  # NOTE(review): not referenced by the code in this patch
+BROWSER_PROFILE = Path("browser_profile")  # NOTE(review): unused here; presumably a Playwright profile dir - confirm
+POSTS_PER_PROFILE = 5  # NOTE(review): unused here; presumably a per-profile post cap - confirm
+
+
+def extract_hashtags(text: str) -> list[str]:
+    """Return the distinct hashtags in *text*, ordered by first appearance.
+
+    A hashtag is ``#`` followed by one or more regex word characters; the
+    leading ``#`` is kept in each returned item.
+    """
+    # dict keys keep insertion order, so fromkeys dedupes without reordering.
+    return list(dict.fromkeys(re.findall(r"#\w+", text)))
+
+
+def extract_mentions(text: str) -> list[str]:
+    """Return the distinct ``@handle`` mentions in *text*, in first-seen order.
+
+    Instagram-style handles may contain interior dots (``@john.doe``); a dot
+    not followed by a word character is treated as punctuation and left out.
+    """
+    # dict keys keep insertion order, so fromkeys dedupes without reordering.
+    return list(dict.fromkeys(re.findall(r"@\w+(?:\.\w+)*", text)))
+
+
+def profile_slug_from_url(url: str) -> str:  # last path segment, ignoring ?query/#fragment
+    return url.partition("?")[0].partition("#")[0].rstrip("/").rsplit("/", 1)[-1]
+
+
+def read_profiles() -> list[str]:
+    """Return the profile URLs listed in ``PROFILES_FILE``, in file order.
+
+    Each line is either a bare URL or a number, a tab, and then the URL;
+    blank lines and lines whose last tab-separated field does not start
+    with ``http`` are skipped.
+    """
+    # utf-8-sig drops a leading BOM, which would otherwise hide the first URL
+    # from the ``http`` check; plain UTF-8/ASCII files decode the same way.
+    lines = PROFILES_FILE.read_text(encoding="utf-8-sig").splitlines()
+    fields = (line.strip().rsplit("\t", 1)[-1].strip() for line in lines)
+    return [url for url in fields if url.startswith("http")]
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
new file mode 100644
index 0000000..fe7ddcf
--- /dev/null
+++ b/tests/test_parsers.py
@@ -0,0 +1,24 @@
+import sys, os
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+from scraper import extract_hashtags, extract_mentions, profile_slug_from_url
+
+def test_extract_hashtags_basic():  # tags come back in order of appearance
+    assert extract_hashtags("Hello #world #foo") == ["#world", "#foo"]
+
+def test_extract_hashtags_empty():  # text without "#" yields an empty list
+    assert extract_hashtags("No tags here") == []
+
+def test_extract_hashtags_deduplicates():  # a repeated tag is reported once
+    assert extract_hashtags("#foo #foo #bar") == ["#foo", "#bar"]
+
+def test_extract_mentions_basic():  # mentions come back in order of appearance
+    assert extract_mentions("Hey @alice and @bob") == ["@alice", "@bob"]
+
+def test_extract_mentions_empty():  # text without "@" yields an empty list
+    assert extract_mentions("No mentions") == []
+
+def test_profile_slug_from_url():  # plain profile URL, no trailing slash
+    assert profile_slug_from_url("https://www.instagram.com/licmuunisul") == "licmuunisul"
+
+def test_profile_slug_trailing_slash():  # a trailing "/" must not yield an empty slug
+    assert profile_slug_from_url("https://www.instagram.com/licmuunisul/") == "licmuunisul"