"use strict";
/**
 * @author Zerobyte
 * @license MIT
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseWiki = exports.docGetLangSectionByTitle = exports.wikiMWElementParser = exports.identifySubsectionHeader = exports.MWMeaningSectionParser = exports.getElementFilteredText = exports.getWiktionaryPage = exports.WikiMediaResource = void 0;
const node_fetch2_1 = require("node-fetch2");
const jsdom_1 = require("jsdom");

/** Base URL that media hrefs are appended to when resolving a media page. */
const WIKTIONARY_BASE_URL = "https://en.wiktionary.org/";

/** Subsection ids whose content is parsed as word meanings. */
const MEANING_SECTION_IDS = ["Proper_noun", "Noun", "Verb", "Adverb", "Adjective", "Conjunction", "Preposition", "Particle"];

/** Every subsection id the parser recognises (meaning sections plus the informational ones). */
const KNOWN_SUBSECTION_IDS = [...MEANING_SECTION_IDS, "Alternative_forms", "Etymology", "Pronunciation"];

/**
 * A string holding the href of a media page; `raw` keeps the value that was
 * passed to the constructor.
 */
class WikiMediaResource extends String {
    /**
     * @param str The href of the media page (defaults to an empty string)
     */
    constructor(str = "") {
        super(str);
        this.raw = str;
    }

    /**
     * Fetches the media page and extracts the link of the ".fullMedia .internal" element.
     * @returns The link (protocol-relative links are prefixed with "https:"), or null if
     * the URL is invalid, the request fails or the page contains no such link
     */
    async getFullMedia() {
        const pageUrl = WIKTIONARY_BASE_URL + this.raw;
        // valid URL?
        try {
            new URL(pageUrl);
        }
        catch (e) {
            return null;
        }
        // try fetching; reading the body can fail as well, so it belongs in the same try
        let html;
        try {
            const res = await node_fetch2_1.default(pageUrl);
            html = await res.text();
        }
        catch (e) {
            return null;
        }
        const doc = new jsdom_1.JSDOM(html).window.document;
        const href = doc.querySelector(".fullMedia .internal")?.getAttribute("href");
        if (!href) {
            return null;
        }
        // Only protocol-relative links need a scheme. Replacing the first "//" anywhere
        // would turn "http://host" into "http:https://host".
        return href.startsWith("//") ? "https:" + href : href;
    }
}
exports.WikiMediaResource = WikiMediaResource;

/**
 * Search for wiktionary pages
 * @param query The search query (raw text; it is URL-encoded here)
 * @param subdomain The wiktionary subdomain
 * @returns Result object `{ success, link, error }`; `link` is the most fitting page URL on success
 */
async function getWiktionaryPage(query, subdomain = "en") {
    /**
     * @param word The search query
     * @param array An array of URLs
     * @returns The most likely best fitting URL; Returns null if the array has a length below 1
     */
    function getMostFittingSearchResultURL(word, array) {
        if (array.length < 1) {
            return null;
        }
        // The url has the query at the end
        const exact = array.find((item) => item.endsWith(word));
        if (exact !== undefined) {
            return exact;
        }
        // The url has the query in lower case at the end
        const lowered = word.toLowerCase();
        const lowerCaseMatch = array.find((item) => item.endsWith(lowered));
        if (lowerCaseMatch !== undefined) {
            return lowerCaseMatch;
        }
        return array[0];
    }

    // Encode the query so characters such as "&", "#" or "+" cannot alter the query string.
    const searchUrl = `https://${subdomain}.wiktionary.org/w/api.php?action=opensearch&format=json&formatversion=2&search=${encodeURIComponent(query)}&namespace=0&limit=3`;

    let json;
    try {
        // API search call; a body that is not valid JSON is reported like a failed request
        const res = await node_fetch2_1.default(searchUrl, {});
        json = await res.json();
    }
    catch (error) {
        return { success: false, error: error, link: null };
    }

    // returned list of URLs (index 3 of the response array); anything else counts as "no results"
    const urls = Array.isArray(json) && Array.isArray(json[3]) ? json[3] : [];
    if (urls.length > 0) {
        return { success: true, link: getMostFittingSearchResultURL(query, urls), error: null };
    }
    return { success: false, link: null, error: "The search did not return any fitting articles" };
}
exports.getWiktionaryPage = getWiktionaryPage;

/**
 * Get the filtered textContent of an HTMLElement: only direct text nodes and
 * direct A/I/B/SPAN/P children contribute; line breaks are removed.
 * @param element HTMLElement: The element whose text should be parsed
 * @returns The concatenated text without "\n" characters
 */
function getElementFilteredText(element) {
    const allowedTags = ["A", "I", "B", "SPAN", "P"];
    let text = "";
    for (const child of element.childNodes) {
        // nodeType 3 is a text node
        if (allowedTags.includes(child?.tagName) || child.nodeType === 3) {
            text += child.textContent;
        }
    }
    return text.replace(/\n/g, "");
}
exports.getElementFilteredText = getElementFilteredText;

/**
 * Parses the subsections of the MW-meaning section
 * @param subsectionName Subsection type
 * @param currentData Current data (mutated and returned)
 * @param hostElement The subsection element
 * @returns currentData
 */
function MWMeaningSectionParser(subsectionName, currentData, hostElement) {
    const internalSubName = subsectionName.toLowerCase();
    const isHeadword = Boolean(hostElement.querySelector(".headword"));
    const isDefinitionList = !isHeadword && hostElement.tagName === "OL";
    if (!isHeadword && !isDefinitionList) {
        return currentData;
    }

    if (!currentData.meanings) {
        currentData.meanings = {
            preposition: null,
            proper_noun: null,
            adjective: null,
            adverb: null,
            verb: null,
            noun: null,
            conjunction: null,
            particle: null
        };
    }

    if (isHeadword) {
        if (!currentData.meanings[internalSubName]) {
            currentData.meanings[internalSubName] = {
                head: hostElement.textContent.replace(/\n/g, ""),
                meanings: []
            };
        }
        return currentData;
    }

    // Definition list. If no headword was seen for this subsection yet, create the
    // entry without a head instead of failing on an undefined entry.
    if (!currentData.meanings[internalSubName]) {
        currentData.meanings[internalSubName] = { head: null, meanings: [] };
    }
    for (const li of hostElement.children) {
        const meaning = { text: getElementFilteredText(li) };
        const example = li.querySelector("dl");
        if (example) {
            meaning.example = example.textContent;
        }
        currentData.meanings[internalSubName].meanings.push(meaning);
    }
    return currentData;
}
exports.MWMeaningSectionParser = MWMeaningSectionParser;

/**
 * Identifies the normalized id of a subsection
 * @param subheaderId The id, e.g. Etymology_4
 * @returns The normalized id, or null if the id is not a known subsection
 */
function identifySubsectionHeader(subheaderId) {
    if (KNOWN_SUBSECTION_IDS.includes(subheaderId)) {
        return subheaderId;
    }
    // Numbered variants such as "Noun_2" map to their base id.
    const match = KNOWN_SUBSECTION_IDS.find((knownID) => new RegExp(knownID + "_\\d{1,}").test(subheaderId));
    return match === undefined ? null : match;
}
exports.identifySubsectionHeader = identifySubsectionHeader;

/**
 * Collects the pronunciation entries of a UL element into currentData.
 * An item whose text starts with "Rhymes" is stored as `rhymes` instead.
 * @param listElement The UL element
 * @param currentData The collected data (mutated)
 */
function parsePronunciationList(listElement, currentData) {
    if (!currentData.pronunciation) {
        currentData.pronunciation = [];
    }
    for (const li of listElement.children) {
        // is rhyme info
        if (li.textContent.startsWith("Rhymes")) {
            currentData.rhymes = li.textContent;
            continue;
        }
        // is audio
        const ipa = li.querySelector(".IPA");
        const audio = li.querySelector(".audiometa > a");
        const type = li.querySelector(".unicode");
        currentData.pronunciation.push({
            IPA: ipa ? ipa.textContent : null,
            audio: audio ? new WikiMediaResource(audio.getAttribute("href")) : null,
            type: type ? type.textContent : null
        });
    }
}

/**
 * Returns the collected data of a MW-Section. Starting at MWElement, every following
 * sibling is visited until the siblings run out or an H2 whose first child has the
 * class "mw-headline" is reached.
 * @param MWElement The MW head element
 * @param currentData The collected data (mutated and returned)
 * @param sectionName Name of the section the first element belongs to
 * @param firstRun When true, the first element is never treated as the start of a new section
 * @returns Current Data
 */
function wikiMWElementParser(MWElement, currentData, sectionName = "None", firstRun = true) {
    // Iterative on purpose: one call per sibling would grow the call stack
    // with the number of elements in the section.
    for (let element = MWElement; element != null; element = element.nextElementSibling) {
        const isSubHeader = element.tagName === "H3" || element.tagName === "H4";

        if (!firstRun && element.tagName === "H2" && element.children[0]?.classList.contains("mw-headline")) {
            // found new section
            break;
        }
        firstRun = false;

        if (!isSubHeader) {
            if (sectionName === "Alternative_forms") {
                currentData.alternatives = { raw: element.textContent };
            }
            else if (sectionName === "Etymology") {
                if (!currentData.etymology) {
                    currentData.etymology = [];
                }
                if (element.textContent !== "\n") {
                    currentData.etymology.push(element.textContent.replace(/\n/g, ""));
                }
            }
            else if (sectionName === "Pronunciation") {
                if (element.tagName === "UL") {
                    parsePronunciationList(element, currentData);
                }
            }
            else if (MEANING_SECTION_IDS.includes(sectionName)) {
                currentData = MWMeaningSectionParser(sectionName, currentData, element);
            }
        }

        if (element.classList.contains("thumb")) {
            // is image
            if (!currentData.images) {
                currentData.images = [];
            }
            currentData.images.push({
                url: new WikiMediaResource(element.querySelector("a.image")?.getAttribute("href")),
                caption: element.querySelector(".thumbcaption")?.textContent
            });
        }
        else if (isSubHeader) {
            // A header without children (or with an unknown id) resets the section to "None".
            sectionName = identifySubsectionHeader(element.children[0]?.id) || "None";
        }
    }
    return currentData;
}
exports.wikiMWElementParser = wikiMWElementParser;

/**
 * Returns the h2 element of the section specified by id or null if it does not exist
 * @param id The id of the section, e.g. 'English' or 'German'
 * @param document The document element
 * @returns h2-HTMLElement | null
 */
function docGetLangSectionByTitle(id, document) {
    const headings = document.querySelectorAll(".mw-parser-output h2");
    for (const heading of headings) {
        // The id is looked up on the first child of the h2.
        const firstChild = heading.children && heading.children.length > 0 ? heading.children[0] : null;
        if (firstChild && firstChild.id === id) {
            return heading;
        }
    }
    return null;
}
exports.docGetLangSectionByTitle = docGetLangSectionByTitle;

/**
 * Returns the parsed data of a specified wiki page specified by URL
 * @param wiki_url The URL of the page to be parsed
 * @param languageId The id of the language section to parse, e.g. 'English'
 * @returns Parsed data; `error` is set when the page could not be loaded or the
 * language section does not exist
 */
async function parseWiki(wiki_url, languageId) {
    let html;
    try {
        // Reading the body can fail like the request itself, so both share the try.
        const res = await node_fetch2_1.default(wiki_url, {});
        html = await res.text();
    }
    catch (error) {
        return {
            error: "Network error",
            rhymes: null,
            url: wiki_url,
            alternatives: null,
            etymology: null,
            pronunciation: null,
            images: null,
            meanings: null
        };
    }

    const doc = new jsdom_1.JSDOM(html).window.document;
    const langHead = docGetLangSectionByTitle(languageId, doc);
    if (!langHead) {
        return {
            alternatives: null,
            url: wiki_url,
            etymology: null,
            images: null,
            pronunciation: null,
            rhymes: null,
            meanings: null,
            error: "Could not find language section " + languageId
        };
    }

    const initial = {
        error: null,
        url: wiki_url,
        alternatives: null,
        etymology: null,
        pronunciation: null,
        rhymes: null,
        images: null,
        meanings: null
    };
    const data = wikiMWElementParser(langHead, initial);

    // Normalise every falsy field to null.
    for (const key of ["alternatives", "error", "etymology", "images", "pronunciation", "rhymes"]) {
        if (!data[key]) {
            data[key] = null;
        }
    }
    return data;
}
exports.parseWiki = parseWiki;

/**
 * Main class: Wiktionary Scraper
 */
class WiktionaryScraper {
    /**
     * @param _subdomain The wiktionary subdomain, e.g. 'en' for en.wiktionary.org
     */
    constructor(_subdomain = "en") {
        this.subdomain = _subdomain;
    }

    /**
     * Searches wiktionary and returns the parsed content of the most fitting search result page
     * @param query The query for the search
     * @param languageId The id of the language section to parse (defaults to "English")
     * @returns Parsed data or error object
     */
    async fetchData(query, languageId = "English") {
        const page = await getWiktionaryPage(query, this.subdomain);
        if (!page.success) {
            return {
                error: page.error,
                etymology: null,
                pronunciation: null,
                url: null,
                alternatives: null,
                rhymes: null,
                images: null,
                meanings: null
            };
        }
        return parseWiki(page.link, languageId);
    }
}
exports.default = WiktionaryScraper;