From cdd71deb1780459fbe5013947767efb57c55eb7f Mon Sep 17 00:00:00 2001
From: belisards
Date: Sun, 29 Mar 2026 16:52:40 -0300
Subject: [PATCH] feat: add parser functions with tests

---
Reviewer note: this patch was hand-edited after export (line structure
restored, docstrings added, slug parsing ignores ?query/#fragment, explicit
UTF-8 read, test names fixed). The blob ids on the "index" lines below are
those of the originally exported commit and no longer match the content.

 pyproject.toml        |  4 ++++
 scraper.py            | 43 +++++++++++++++++++++++++++++++++++++++++++
 tests/__init__.py     |  0
 tests/test_parsers.py | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 79 insertions(+)
 create mode 100644 scraper.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_parsers.py

diff --git a/pyproject.toml b/pyproject.toml
index b4ca1c6..6f48b0c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,3 +6,7 @@ dependencies = [
     "playwright>=1.40.0",
 ]
 
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0.0",
+]
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..5fe3dff
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,43 @@
+import re
+import csv
+import sys
+from pathlib import Path
+from playwright.sync_api import sync_playwright, Page
+
+PROFILES_FILE = Path("profiles.txt")
+OUTPUT_CSV = Path("output.csv")
+OUTPUT_MD = Path("output.md")
+BROWSER_PROFILE = Path("browser_profile")
+POSTS_PER_PROFILE = 5
+
+
+def extract_hashtags(text: str) -> list[str]:
+    """Return the unique ``#tag`` tokens in *text*, in first-seen order."""
+    # dict.fromkeys de-duplicates while preserving insertion order.
+    return list(dict.fromkeys(re.findall(r"#\w+", text)))
+
+
+def extract_mentions(text: str) -> list[str]:
+    """Return the unique ``@name`` tokens in *text*, in first-seen order."""
+    return list(dict.fromkeys(re.findall(r"@\w+", text)))
+
+
+def profile_slug_from_url(url: str) -> str:
+    """Return the last path segment of *url*, ignoring query and fragment."""
+    # Cut "?query" / "#fragment" first so they never end up in the slug.
+    path = re.split(r"[?#]", url, maxsplit=1)[0]
+    return path.rstrip("/").split("/")[-1]
+
+
+def read_profiles() -> list[str]:
+    """Return the URLs in ``PROFILES_FILE`` whose text starts with "http"."""
+    urls = []
+    for line in PROFILES_FILE.read_text(encoding="utf-8").splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        # Lines may be prefixed with a number and tab
+        url = line.split("\t")[-1].strip()
+        if url.startswith("http"):
+            urls.append(url)
+    return urls
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
new file mode 100644
index 0000000..fe7ddcf
--- /dev/null
+++ b/tests/test_parsers.py
@@ -0,0 +1,32 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+from scraper import extract_hashtags, extract_mentions, profile_slug_from_url
+
+def test_extract_hashtags_basic():
+    assert extract_hashtags("Hello #world #foo") == ["#world", "#foo"]
+
+def test_extract_hashtags_empty():
+    assert extract_hashtags("No tags here") == []
+
+def test_extract_hashtags_deduplicates():
+    assert extract_hashtags("#foo #foo #bar") == ["#foo", "#bar"]
+
+def test_extract_mentions_basic():
+    assert extract_mentions("Hey @alice and @bob") == ["@alice", "@bob"]
+
+def test_extract_mentions_empty():
+    assert extract_mentions("No mentions") == []
+
+def test_extract_mentions_deduplicates():
+    assert extract_mentions("@alice @bob @alice") == ["@alice", "@bob"]
+
+def test_profile_slug_trailing_slash():
+    assert profile_slug_from_url("https://www.instagram.com/licmuunisul/") == "licmuunisul"
+
+def test_profile_slug_no_trailing_slash():
+    assert profile_slug_from_url("https://www.instagram.com/licmuunisul") == "licmuunisul"
+
+def test_profile_slug_ignores_query():
+    assert profile_slug_from_url("https://www.instagram.com/licmuunisul/?hl=en") == "licmuunisul"