"use strict";
/**
 * @author Zerobyte
 * @license MIT
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseWiki = exports.docGetLangSectionByTitle = exports.wikiMWElementParser = exports.identifySubsectionHeader = exports.MWMeaningSectionParser = exports.getElementFilteredText = exports.getWiktionaryPage = exports.WikiMediaResource = void 0;
const node_fetch2_1 = require("node-fetch2");
const jsdom_1 = require("jsdom");
/**
 * String wrapper around the href of a media file page (audio or image) found in a wiktionary article.
 * The instance behaves like the href string itself; the unwrapped value is also kept in `raw`.
 */
class WikiMediaResource extends String {
    /**
     * @param str The href of the media file page; it is appended to "https://en.wiktionary.org/"
     */
    constructor(str = "") {
        super(str);
        // `this` is a String wrapper object, so the original value is kept for later use
        this.raw = str;
    }
    /**
     * Fetches the media file page on en.wiktionary.org and reads the href of its
     * `.fullMedia .internal` link.
     * @returns The URL of the full media file (protocol-relative links are prefixed with "https:"),
     * or null if the href is unusable, the request fails or the page has no such link
     */
    async getFullMedia() {
        if (typeof this.raw !== "string" || this.raw === "") {
            return null;
        }
        // valid URL? Leading slashes are dropped so the path does not start with "//"
        let pageUrl;
        try {
            pageUrl = new URL("https://en.wiktionary.org/" + this.raw.replace(/^\/+/, ""));
        }
        catch (e) {
            return null;
        }
        // try fetching; reading the body can fail as well, so it belongs in the same try
        let html;
        try {
            const res = await node_fetch2_1.default(pageUrl.href);
            html = await res.text();
        }
        catch (e) {
            return null;
        }
        const doc = new jsdom_1.JSDOM(html).window.document;
        const href = doc.querySelector(".fullMedia .internal")?.getAttribute("href");
        if (!href) {
            return null;
        }
        // Only protocol-relative links need a scheme; anything else is returned untouched
        return href.startsWith("//") ? "https:" + href : href;
    }
}
exports.WikiMediaResource = WikiMediaResource;
/**
 * Search for wiktionary pages
 * @param query The search query
 * @param subdomain The wiktionary subdomain
 * @returns An object `{ success, link, error }`: on success `link` is the URL of the best fitting
 * page and `error` is null; otherwise `link` is null and `error` holds the thrown error or a message
 */
async function getWiktionaryPage(query, subdomain = "en") {
    /**
     * Extracts a comparable page title from a result URL.
     * Assumes article URLs look like ".../wiki/<Title>" (TODO confirm for all subdomains);
     * falls back to the last path segment otherwise.
     * @param url A result URL
     * @returns The decoded title with underscores turned into spaces
     */
    function getPageTitle(url) {
        const marker = "/wiki/";
        const markerIndex = url.indexOf(marker);
        const encoded = markerIndex >= 0
            ? url.slice(markerIndex + marker.length)
            : url.slice(url.lastIndexOf("/") + 1);
        let title = encoded;
        try {
            title = decodeURIComponent(encoded);
        }
        catch (e) {
            // malformed percent-escape: compare against the raw segment instead
        }
        return title.replace(/_/g, " ");
    }
    /**
     *
     * @param word The search query
     * @param array An array of URLs
     * @returns The most likely best fitting URL; Returns null if the array has a length below 1
     */
    function getMostFittingSearchResultURL(word, array) {
        if (array.length < 1) {
            return null;
        }
        const wantedTitle = word.replace(/_/g, " ");
        const loweredWord = word.toLowerCase();
        // Result URLs are percent-encoded, so the decoded title is compared first;
        // the plain suffix checks remain as fallbacks.
        return array.find((item) => getPageTitle(item) === wantedTitle)
            ?? array.find((item) => getPageTitle(item) === wantedTitle.toLowerCase())
            ?? array.find((item) => item.endsWith(word))
            ?? array.find((item) => item.endsWith(loweredWord))
            ?? array[0];
    }
    let json;
    try {
        // API search call; searchParams takes care of escaping the query
        const apiUrl = new URL("https://" + subdomain + ".wiktionary.org/w/api.php");
        apiUrl.searchParams.set("action", "opensearch");
        apiUrl.searchParams.set("format", "json");
        apiUrl.searchParams.set("formatversion", "2");
        apiUrl.searchParams.set("search", query);
        apiUrl.searchParams.set("namespace", "0");
        apiUrl.searchParams.set("limit", "3");
        const res = await node_fetch2_1.default(apiUrl.href, {});
        json = await res.json();
    }
    catch (error) {
        return { success: false, error: error, link: null };
    }
    // returned list of URLs (fourth entry of the response array)
    const urls = Array.isArray(json) ? json[3] : null;
    if (Array.isArray(urls) && urls.length > 0) {
        return { success: true, link: getMostFittingSearchResultURL(String(query), urls), error: null };
    }
    return { success: false, link: null, error: "The search did not return any fitting articles" };
}
exports.getWiktionaryPage = getWiktionaryPage;
/**
 * Collects the text of an element's direct children, keeping only text nodes and
 * A, I, B, SPAN and P elements, and strips all line breaks from the result
 * @param element HTMLElement: The element whose text should be parsed
 * @returns The concatenated, newline-free text
 */
function getElementFilteredText(element) {
    const keptTags = ["A", "I", "B", "SPAN", "P"];
    const TEXT_NODE = 3;
    const collected = Array.from(element.childNodes).reduce((soFar, node) => {
        const isKeptElement = keptTags.includes(node?.tagName);
        if (isKeptElement || node.nodeType == TEXT_NODE) {
            return soFar + node.textContent;
        }
        return soFar;
    }, "");
    return collected.replace(/\n/g, "");
}
exports.getElementFilteredText = getElementFilteredText;
/**
 * Parses one element of a meaning subsection (noun, verb, ...) into `currentData.meanings`.
 * An element containing a `.headword` supplies the head line of the subsection; an OL element
 * supplies its definitions, one per child, with the text of a nested `dl` stored as `example`.
 * Any other element leaves the data untouched.
 * @param subsectionName Subsection type, e.g. "Noun"; its lower-cased form is the key in `meanings`
 * @param currentData Current data (passed recursively); it is modified in place
 * @param hostElement The subsection element
 * @returns The same `currentData` object
 */
function MWMeaningSectionParser(subsectionName, currentData, hostElement) {
    const sectionKey = subsectionName.toLowerCase();
    // Creates the containers on demand so that a definition list that shows up before
    // (or without) a headword no longer fails on a missing entry.
    const ensureEntry = () => {
        if (!currentData.meanings) {
            currentData.meanings = { preposition: null, proper_noun: null, adjective: null, adverb: null, verb: null, noun: null, conjunction: null, particle: null };
        }
        if (!currentData.meanings[sectionKey]) {
            currentData.meanings[sectionKey] = { head: null, meanings: [] };
        }
        return currentData.meanings[sectionKey];
    };
    // Is it the headword?
    if (hostElement.querySelector(".headword")) {
        const entry = ensureEntry();
        // The first headword wins; later ones for the same subsection type are ignored
        if (entry.head == null) {
            entry.head = hostElement.textContent.replace(/\n/g, "");
        }
    }
    else if (hostElement.tagName === "OL") {
        const entry = ensureEntry();
        for (const li of hostElement.children) {
            const definition = { text: getElementFilteredText(li) };
            const exampleElement = li.querySelector("dl");
            if (exampleElement) {
                definition.example = exampleElement.textContent;
            }
            entry.meanings.push(definition);
        }
    }
    // recursive
    return currentData;
}
exports.MWMeaningSectionParser = MWMeaningSectionParser;
/**
 * Identifies the normalized id of a subsection
 * @param subheaderId The id, e.g. Etymology_4
 * @returns The normalized id (the known id without its numeric suffixes), or null if the id is
 * neither a known id nor a known id followed by one or more "_<digits>" groups
 */
function identifySubsectionHeader(subheaderId) {
    const knownIDs = ["Proper_noun", "Noun", "Verb", "Adverb", "Adjective", "Conjunction", "Preposition", "Particle", "Alternative_forms", "Etymology", "Pronunciation"];
    if (knownIDs.includes(subheaderId))
        return subheaderId;
    // Anchored on both ends so a known id embedded in an unrelated id is not picked up;
    // stacked suffixes such as "Etymology_1_2" are still accepted.
    const numberedId = new RegExp("^(" + knownIDs.join("|") + ")(?:_\\d+)+$");
    const match = numberedId.exec(String(subheaderId));
    return match ? match[1] : null;
}
exports.identifySubsectionHeader = identifySubsectionHeader;
/**
 * Returns the collected data of a MW-Section.
 * Starting at the given element, walks through the following siblings until there are no more
 * or until an H2 whose first child has the class "mw-headline" is reached (the next section).
 * H3/H4 headings switch the current subsection; every other element is interpreted according
 * to the subsection it belongs to. Elements with the class "thumb" are collected as images.
 * @param MWElement The MW head element (may be null, in which case nothing is parsed)
 * @param currentData The collected data; it is modified in place
 * @param sectionName Name of the section the start element belongs to
 * @param firstRun When true, the start element is never treated as the beginning of a new section
 * @returns Current Data
 */
function wikiMWElementParser(MWElement, currentData, sectionName = "None", firstRun = true) {
    const meaningSections = ["Proper_noun", "Noun", "Verb", "Adverb", "Adjective", "Conjunction", "Preposition", "Particle"];
    let isStartElement = firstRun;
    // A loop instead of one recursive call per sibling keeps the stack flat on long pages
    for (let element = MWElement; element != null; element = element.nextElementSibling) {
        const tag = element.tagName;
        if (!isStartElement && tag === "H2" && element.children[0]?.classList.contains("mw-headline")) {
            // found new section
            break;
        }
        isStartElement = false;
        const isSubHeading = tag === "H3" || tag === "H4";
        if (!isSubHeading) {
            if (sectionName === "Alternative_forms") {
                currentData.alternatives = { raw: element.textContent };
            }
            else if (sectionName === "Etymology") {
                if (!currentData.etymology)
                    currentData.etymology = [];
                // elements consisting of a single line break carry no text
                if (element.textContent !== "\n")
                    currentData.etymology.push(element.textContent.replace(/\n/g, ""));
            }
            else if (sectionName === "Pronunciation") {
                if (tag === "UL") {
                    if (!currentData.pronunciation)
                        currentData.pronunciation = [];
                    for (const li of element.children) {
                        // is rhyme info
                        if (li.textContent.startsWith("Rhymes")) {
                            currentData.rhymes = li.textContent;
                            continue;
                        }
                        // is audio
                        const ipa = li.querySelector(".IPA");
                        const audio = li.querySelector(".audiometa > a");
                        const type = li.querySelector(".unicode");
                        currentData.pronunciation.push({
                            IPA: ipa ? ipa.textContent : null,
                            audio: audio ? new WikiMediaResource(audio.getAttribute("href")) : null,
                            type: type ? type.textContent : null
                        });
                    }
                }
            }
            else if (meaningSections.includes(sectionName)) {
                currentData = MWMeaningSectionParser(sectionName, currentData, element);
            }
        }
        if (element.classList.contains("thumb")) {
            // is image
            if (!currentData.images)
                currentData.images = [];
            currentData.images.push({
                url: new WikiMediaResource(element.querySelector("a.image")?.getAttribute("href")),
                caption: element.querySelector(".thumbcaption")?.textContent
            });
        }
        else if (isSubHeading) {
            // a heading without child elements has no id to look up and resets the section
            sectionName = identifySubsectionHeader(element.children[0]?.id) || "None";
        }
    }
    return currentData;
}
exports.wikiMWElementParser = wikiMWElementParser;
/**
 * Looks through all `.mw-parser-output h2` elements of the document and returns the first one
 * whose first child element carries the requested id
 * @param id The id of the section, e.g. 'English' or 'German'
 * @param document The document element
 * @returns h2-HTMLElement | null
 */
function docGetLangSectionByTitle(id, document) {
    const headings = Array.from(document.querySelectorAll(".mw-parser-output h2"));
    const found = headings.find((heading) => {
        const kids = heading.children;
        // headings without child elements cannot carry the section id
        if (!kids || !(kids.length > 0)) {
            return false;
        }
        return kids[0].id == id;
    });
    return found === undefined ? null : found;
}
exports.docGetLangSectionByTitle = docGetLangSectionByTitle;
/**
 * Returns the parsed data of a specified wiki page specified by URL
 * @param wiki_url The URL of the page to be parsed
 * @param languageId The id of the language section to parse, e.g. 'English'
 * @returns Parsed data; if the page cannot be loaded or the language section is missing, all
 * content fields are null and `error` holds a message
 */
async function parseWiki(wiki_url, languageId) {
    // Every result has the same shape; only `error` differs between the failure cases
    const blankResult = (error) => ({ error: error, url: wiki_url, alternatives: null, etymology: null, pronunciation: null, rhymes: null, images: null, meanings: null });
    let html;
    try {
        const res = await node_fetch2_1.default(wiki_url, {});
        // reading the body can fail just like the request itself
        html = await res.text();
    }
    catch (error) {
        return blankResult("Network error");
    }
    const doc = (new jsdom_1.JSDOM(html)).window.document;
    const mw_lang_head = docGetLangSectionByTitle(languageId, doc);
    if (!mw_lang_head) {
        return blankResult("Could not find language section " + languageId);
    }
    const data = wikiMWElementParser(mw_lang_head, blankResult(null));
    // normalize empty values (e.g. "" or undefined) to null
    for (const key of ["alternatives", "error", "etymology", "images", "pronunciation", "rhymes"]) {
        if (!data[key])
            data[key] = null;
    }
    return data;
}
exports.parseWiki = parseWiki;
/**
 * Main class: Wiktionary Scraper.
 * Combines the page search and the page parser for one wiktionary subdomain.
 */
class WiktionaryScraper {
    /**
     * Creates a scraper bound to one wiktionary subdomain
     * @param _subdomain The wiktionary subdomain, e.g. 'en' for en.wiktionary.org
     */
    constructor(_subdomain = "en") {
        this.subdomain = _subdomain;
    }
    /**
     * Looks up the query on the configured subdomain and parses the best matching page
     * @param query The query for the search
     * @param languageId The id of the language section to parse
     * @returns Parsed data, or an object with only `error` set if the search was not successful
     */
    async fetchData(query, languageId = "English") {
        const searchResult = await getWiktionaryPage(query, this.subdomain);
        if (!searchResult.success) {
            // search failed: hand the search error back in the shape of a parse result
            return { error: searchResult.error, etymology: null, pronunciation: null, url: null, alternatives: null, rhymes: null, images: null, meanings: null };
        }
        return await parseWiki(searchResult.link, languageId);
    }
}
exports.default = WiktionaryScraper;