"""Playwright-based page fetcher — Crawl4AI-compatible fallback for Python 3.8."""

from __future__ import annotations

import logging
import re
from html import unescape

logger = logging.getLogger(__name__)


def _html_to_text(html: str) -> str:
    text = re.sub(r"(?is)<(script|style|nav|footer|header|aside|ins)[^>]*>.*?</\1>", " ", html)
    text = re.sub(r"(?is)<br\s*/?>", "\n", text)
    text = re.sub(r"(?is)</?(p|div|li|h[1-6]|tr|td|th|article|section)[^>]*>", "\n", text)
    text = re.sub(r"<[^>]+>", " ", text)
    text = unescape(text)
    text = re.sub(r"[ \t]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def fetch_page_playwright(url: str, *, timeout_ms: int = 30000) -> tuple[str, str]:
    """
    Load ``url`` in headless Chromium through Playwright's sync API.

    Args:
        url: Address to navigate to.
        timeout_ms: Timeout passed to the navigation call, in milliseconds.

    Returns:
        A ``(title, text)`` pair. ``title`` is the page title, or ``url`` when
        the title is empty, cut to at most 300 characters. ``text`` is the
        page HTML converted by ``_html_to_text``.

    Errors raised by Playwright are not caught here; the browser is closed
    before they propagate.
    """
    # Imported here rather than at module level, so importing this module does
    # not require Playwright to be installed.
    from playwright.sync_api import sync_playwright

    with sync_playwright() as pw:
        chromium = pw.chromium.launch(headless=True)
        try:
            tab = chromium.new_page()
            tab.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
            # Fixed pause after DOMContentLoaded before the page is read.
            tab.wait_for_timeout(500)
            page_title = tab.title()
            markup = tab.content()
        finally:
            chromium.close()

    if not page_title:
        page_title = url
    return page_title[:300], _html_to_text(markup)


def playwright_available() -> bool:
    """Report whether the ``playwright`` package can be imported."""
    importable = True
    try:
        import playwright  # noqa: F401
    except ImportError:
        importable = False
    return importable
