# insta_scraper/scraper.py

import re
import csv
import sys
from pathlib import Path
from playwright.sync_api import sync_playwright, Page

# Module-level configuration. All paths are relative, so they resolve against
# the current working directory of the process.

# Text file listing the profile URLs to process; parsed by read_profiles().
PROFILES_FILE = Path("profiles.txt")
# NOTE(review): presumably the CSV results file -- not used in this part of
# the file; confirm against the writer code.
OUTPUT_CSV = Path("output.csv")
# NOTE(review): presumably the Markdown results file -- confirm against the
# writer code.
OUTPUT_MD = Path("output.md")
# NOTE(review): presumably the directory for a persistent browser profile
# used with Playwright -- confirm where the browser is launched.
BROWSER_PROFILE = Path("browser_profile")
# NOTE(review): presumably the maximum number of posts collected per profile
# -- confirm against the scraping loop.
POSTS_PER_PROFILE = 5


def extract_hashtags(text: str) -> list[str]:
    """Return the distinct hashtags found in *text*, in first-seen order.

    A hashtag is a ``#`` immediately followed by one or more regex word
    characters. The leading ``#`` is kept in each result, and comparison is
    case-sensitive (``#Foo`` and ``#foo`` are both returned).
    """
    # dict keys preserve insertion order, so fromkeys() discards repeats
    # while keeping each tag at the position of its first occurrence.
    return list(dict.fromkeys(re.findall(r"#\w+", text)))


def extract_mentions(text: str) -> list[str]:
    """Return the distinct @-mentions found in *text*, in first-seen order.

    A mention is an ``@`` immediately followed by one or more regex word
    characters, so matching stops at the first non-word character such as
    ``.``. The leading ``@`` is kept in each result, and comparison is
    case-sensitive.
    """
    # dict keys preserve insertion order, so fromkeys() discards repeats
    # while keeping each mention at the position of its first occurrence.
    return list(dict.fromkeys(re.findall(r"@\w+", text)))


def profile_slug_from_url(url: str) -> str:
    """Return the last path segment of *url*.

    Any fragment (``#...``) and query string (``?...``) are discarded first,
    then trailing slashes are removed and the text after the final ``/`` is
    returned. For example, ``https://www.instagram.com/nasa/?hl=en`` yields
    ``nasa``.
    """
    # Drop the fragment before the query: in a URL the fragment comes last,
    # so cutting at "#" first keeps a "?" inside the fragment from mattering.
    without_fragment = url.partition("#")[0]
    without_query = without_fragment.partition("?")[0]
    return without_query.rstrip("/").split("/")[-1]


def read_profiles() -> list[str]:
    """Return the profile URLs listed in ``PROFILES_FILE``.

    Blank lines are skipped. For every other line, the text after the last
    tab character is taken as the candidate URL, so a leading
    ``<number><TAB>`` prefix is ignored. A candidate is kept only if it
    starts with ``http``. File order and duplicates are preserved.

    The file is decoded as UTF-8; a leading byte-order mark is tolerated.

    Raises:
        FileNotFoundError: if ``PROFILES_FILE`` does not exist.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    urls = []
    # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM. Without it
    # the BOM stays attached to the first line (str.strip() does not remove
    # it), the "http" check fails, and the first URL is silently lost.
    for line in PROFILES_FILE.read_text(encoding="utf-8-sig").splitlines():
        line = line.strip()
        if not line:
            continue
        # Lines may be prefixed with a number and a tab; keep only the text
        # after the last tab.
        url = line.rpartition("\t")[2].strip()
        if url.startswith("http"):
            urls.append(url)
    return urls