const cheerio = require("cheerio");

/**
 * Options accepted by `extractMainContent`.
 */
export interface ExtractorOptions {
  /**
   * Extra CSS selectors whose matching elements are removed before
   * extraction. These are applied in addition to the built-in removal list.
   */
  removeSelectors?: string[];
  /**
   * CSS selectors tried, in order, to locate the main content container.
   * When provided, this list replaces the built-in candidates rather than
   * extending them.
   */
  mainContentSelectors?: string[];
}

/**
 * Extracts the readable text of a page's main content from raw HTML.
 *
 * Steps:
 * 1. Remove common non-content elements (navigation, sidebars, ads, ...)
 *    plus any selectors supplied in `options.removeSelectors`.
 * 2. Pick the first main-content selector that matches anything and use the
 *    inner HTML of its first match; fall back to `<body>` when nothing
 *    matches or the match is empty.
 * 3. Emit one line per heading / paragraph / list element, in document order.
 *
 * Each element contributes only its *own* text: text that belongs to a nested
 * heading, paragraph, list or list item is emitted by that nested element, so
 * list items are not repeated once for the list and once for the item.
 * Elements with no own text are skipped. Text that is not inside any of these
 * elements is not included in the result.
 *
 * @param html - HTML document or fragment to process.
 * @param options - Optional selector overrides; see {@link ExtractorOptions}.
 * @returns Extracted lines joined with `\n`; an empty string if nothing was found.
 */
export function extractMainContent(
  html: string,
  options: ExtractorOptions = {}
): string {
  const $ = cheerio.load(html);

  // Elements that commonly hold page chrome rather than content.
  const defaultRemove = [
    "nav",
    "aside",
    "footer",
    "header",
    ".ads",
    ".advertisement",
    ".sidebar",
    ".popup",
    ".banner",
    '[role="navigation"]',
    '[role="banner"]',
    '[role="complementary"]',
    '[role="contentinfo"]',
  ];
  const removeSelectors = [
    ...defaultRemove,
    ...(options.removeSelectors || []),
  ];
  for (const sel of removeSelectors) {
    $(sel).remove();
  }

  // Candidate containers for the main content, tried in order.
  const mainSelectors = options.mainContentSelectors || [
    "article",
    "main",
    "#content",
    ".post",
    ".entry-content",
  ];
  let mainHtml = "";
  for (const sel of mainSelectors) {
    const match = $(sel);
    if (match.length) {
      // .html() returns the inner HTML of the first matched element.
      mainHtml = match.html() || "";
      break;
    }
  }
  if (!mainHtml) mainHtml = $("body").html() || "";

  // Text-only extraction that keeps block structure (headings, paragraphs, lists).
  const blockSelector = "h1,h2,h3,h4,h5,h6,p,ul,ol,li";
  const $main = cheerio.load(mainHtml);
  const lines: string[] = [];
  $main(blockSelector).each((_: unknown, el: unknown) => {
    // Work on a clone with nested block elements stripped so that a container
    // (<ul>, <ol>, or an <li> holding a sub-list or paragraph) does not repeat
    // text that its descendants will emit themselves.
    const own = $main(el).clone();
    own.find(blockSelector).remove();
    const text: string = own.text().trim();
    if (text) lines.push(text);
  });
  return lines.join("\n");
}
