feat: add parser functions with tests
This commit is contained in:
49
scraper.py
Normal file
49
scraper.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import re
|
||||
import csv
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from playwright.sync_api import sync_playwright, Page
|
||||
|
||||
PROFILES_FILE = Path("profiles.txt")
|
||||
OUTPUT_CSV = Path("output.csv")
|
||||
OUTPUT_MD = Path("output.md")
|
||||
BROWSER_PROFILE = Path("browser_profile")
|
||||
POSTS_PER_PROFILE = 5
|
||||
|
||||
|
||||
def extract_hashtags(text: str) -> list[str]:
    """Return the hashtags found in *text*, deduplicated, in first-seen order.

    A hashtag is a ``#`` followed by one or more word characters.
    """
    # dict.fromkeys preserves insertion order, so this dedupes while
    # keeping the first occurrence of each tag.
    return list(dict.fromkeys(re.findall(r"#\w+", text)))
|
||||
|
||||
|
||||
def extract_mentions(text: str) -> list[str]:
    """Return the @-mentions found in *text*, deduplicated, in first-seen order.

    A mention is an ``@`` followed by one or more word characters.
    """
    # Order-preserving dedup: dict keys keep insertion order.
    return list(dict.fromkeys(re.findall(r"@\w+", text)))
|
||||
|
||||
|
||||
def profile_slug_from_url(url: str) -> str:
    """Return the final path segment of *url* (the profile slug)."""
    # Strip any trailing slashes first so the last segment is non-empty,
    # then take everything after the final "/". If no "/" is present,
    # rfind returns -1 and the whole string is returned.
    trimmed = url.rstrip("/")
    return trimmed[trimmed.rfind("/") + 1:]
|
||||
|
||||
|
||||
def read_profiles() -> list[str]:
    """Read profile URLs from ``PROFILES_FILE``, one per line.

    Blank lines are skipped. Lines may be prefixed with a number and a
    tab; only the final tab-separated field is considered. Entries are
    kept only when they start with ``"http"``.
    """
    urls: list[str] = []
    for raw_line in PROFILES_FILE.read_text().splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        # Lines may be prefixed with a number and tab — take the last field.
        candidate = stripped.rsplit("\t", 1)[-1].strip()
        if candidate.startswith("http"):
            urls.append(candidate)
    return urls
|
||||
Reference in New Issue
Block a user