feat: add parser functions with tests
This commit adds pure parsing helpers to scraper.py, unit tests for them, and a dev dependency group for pytest.
@@ -6,3 +6,7 @@ dependencies = [
     "playwright>=1.40.0",
 ]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0.0",
+]
|||||||
scraper.py · 49 lines · new file
@@ -0,0 +1,49 @@
|
|||||||
|
import re
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from playwright.sync_api import sync_playwright, Page
|
||||||
|
|
||||||
|
# Input list: one profile URL per line (lines may carry a "number<TAB>" prefix
# that read_profiles() strips — see that function).
PROFILES_FILE = Path("profiles.txt")

# Scrape results are emitted twice: machine-readable CSV and human-readable Markdown.
OUTPUT_CSV = Path("output.csv")
OUTPUT_MD = Path("output.md")

# Directory handed to the browser for persistent state
# (presumably keeps login sessions between runs — TODO confirm at call site).
BROWSER_PROFILE = Path("browser_profile")

# Cap on posts collected per profile (usage not visible in this chunk).
POSTS_PER_PROFILE = 5
|
||||||
|
|
||||||
|
|
||||||
|
def extract_hashtags(text: str) -> list[str]:
    """Return every ``#word`` hashtag in *text*, deduplicated.

    First-occurrence order is preserved; a tag that repeats is kept
    only where it first appeared.
    """
    # dict.fromkeys drops duplicates while keeping insertion order,
    # which is exactly the "seen set + result list" pattern in one step.
    return list(dict.fromkeys(re.findall(r"#\w+", text)))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_mentions(text: str) -> list[str]:
    """Return every ``@name`` mention in *text*, deduplicated.

    First-occurrence order is preserved, mirroring extract_hashtags.
    """
    # Ordered dedupe in one expression: dict keys are unique and ordered.
    return list(dict.fromkeys(re.findall(r"@\w+", text)))
|
||||||
|
|
||||||
|
|
||||||
|
def profile_slug_from_url(url: str) -> str:
    """Return the last path component of *url* (the profile handle)."""
    # Drop trailing slashes first so ".../name/" and ".../name" agree;
    # rpartition yields the text after the final "/" (or the whole string
    # when no "/" remains, matching split("/")[-1]).
    _, _, slug = url.rstrip("/").rpartition("/")
    return slug
|
||||||
|
|
||||||
|
|
||||||
|
def read_profiles() -> list[str]:
    """Read profile URLs from PROFILES_FILE.

    Each non-blank line may be prefixed with a number and a tab; only the
    final tab-separated field is examined, and it is kept only when it
    starts with "http". Returns the kept URLs in file order.
    """
    urls: list[str] = []
    for raw_line in PROFILES_FILE.read_text().splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        # Keep only the last tab-separated field (drops a "N<TAB>" prefix).
        candidate = stripped.split("\t")[-1].strip()
        if candidate.startswith("http"):
            urls.append(candidate)
    return urls
|
||||||
tests/__init__.py · 0 lines · new file
tests/test_parsers.py · 24 lines · new file
@@ -0,0 +1,24 @@
|
|||||||
|
import sys, os
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
||||||
|
from scraper import extract_hashtags, extract_mentions, profile_slug_from_url
|
||||||
|
|
||||||
|
def test_extract_hashtags_basic():
    # Two distinct tags come back in appearance order.
    text = "Hello #world #foo"
    assert extract_hashtags(text) == ["#world", "#foo"]
|
||||||
|
|
||||||
|
def test_extract_hashtags_empty():
    # Text with no "#" at all yields an empty list.
    result = extract_hashtags("No tags here")
    assert result == []
|
||||||
|
|
||||||
|
def test_extract_hashtags_deduplicates():
    # Repeated tags appear once, at their first position.
    tags = extract_hashtags("#foo #foo #bar")
    assert tags == ["#foo", "#bar"]
|
||||||
|
|
||||||
|
def test_extract_mentions_basic():
    # Two mentions, returned in appearance order.
    result = extract_mentions("Hey @alice and @bob")
    assert result == ["@alice", "@bob"]
|
||||||
|
|
||||||
|
def test_extract_mentions_empty():
    # No "@" in the text -> empty list.
    assert extract_mentions("No mentions") == []
|
||||||
|
|
||||||
|
def test_profile_slug_from_url():
    # URL ending in "/" still yields the bare handle.
    url = "https://www.instagram.com/licmuunisul/"
    assert profile_slug_from_url(url) == "licmuunisul"
|
||||||
|
|
||||||
|
def test_profile_slug_trailing_slash():
    # NOTE(review): despite the name, this covers the *no*-trailing-slash
    # case (the sibling test covers the trailing-slash one) — consider renaming.
    url = "https://www.instagram.com/licmuunisul"
    assert profile_slug_from_url(url) == "licmuunisul"
|
||||||
Reference in New Issue
Block a user