"use strict";
import * as url from "url";
import * as querystring from "querystring";
//const knownDomain = require(global.__legacyModules ? '../_db/_mongodb/models/knownDomain' : '../../models/knownDomain')
//const knownDomain = require('staydown-models').knownDomain
//const utils = global.__legacyModules ? require('strictutils')() : require('@ekliptor/apputils')
import utils = require('./utils')
//const nconf = utils.nconf;
import entities = require("entities")

/** When true, getExternalLinks() passes the HTML through utils.decodeHtml() before scanning. */
let decodeHtmlLinks = false;

/**
 * The known-domain module injected via setKnownDomains().
 * It stays null until setKnownDomains() is called; every extractor below reads
 * properties of it, so setKnownDomains() must be called first.
 */
let knownDomain = null;

const MAX_LINKS_PER_PAGE = 50000 // per page per type
const SKIP_EMPTY_URL_PATH = true // don't crawl urls with path /

/** Lower-case substrings; a link containing any of them is treated as spam (see isFilteredLink). */
let filteredParts = ['/ref/', 'ref=', 'referrer', '/aff/', 'aff=', 'affiliate', 'register', 'partner', 'signup', 'forumstatus.php']

/**
 * Tokens that terminate a link inside raw HTML. The earliest occurrence after the
 * start of a match marks the end of the link.
 */
const LINK_TERMINATORS = [
    "\n",  // real line ending
    "\\n", // escaped line ending in javascript strings with links
    "\r",  // repeat the same for windows
    "\\r",
    "<",
    ">",   // value=https://www.oboom.com/IS3JZCPX/asex.wmv> is valid html (without quotes)
    " ",
    '"',   // TODO http://www.pornmade.com/riley-reid-in-riley-really-has-fun-twistys/ html attributes without quotes are valid!
    "'",
    "[/"   // the end of BB tags
]

/**
 * Resolves a link that goes through a known dereferer and undoes URL encoding.
 *
 * If the hostname of `urlObject` is listed in knownDomain.DEREFERER and the URL has a
 * non-empty query, the (decoded) query replaces the link. Afterwards the link is decoded
 * once more if it still contains an encoded "/", "?" or space, and a leading "?" is removed.
 *
 * @param urlObject the parsed form of `link`
 * @param link the link as found in the page
 * @returns the resolved link
 */
const derefLink = (urlObject: url.Url, link: string) => {
    for (const deref of knownDomain.DEREFERER) {
        if (urlObject.hostname !== deref)
            continue
        const query = urlObject.query
        // The query is null for URLs without a query string; reading .length of it used to throw.
        if (query == null || query === "")
            continue
        // should always be string? Serialize it in case the URL was parsed with query parsing enabled.
        link = utils.urlDecode(typeof query === "string" ? query : querystring.stringify(query))
        break
    }
    const linkLower = link.toLowerCase()
    const stillEncoded = ['%2f', '%3f', '%20'].some((token) => linkLower.indexOf(token) !== -1) // / or ? or space
    if (stillEncoded)
        link = utils.urlDecode(link)
    if (link.startsWith("?"))
        link = link.slice(1)
    return link
}

/**
 * Checks whether a link should be dropped as spam.
 *
 * @param link the link to check
 * @returns true if the link ends with "favicon.ico" (case-insensitive) or contains one of `filteredParts`
 */
const isFilteredLink = (link: string) => {
    if (/favicon\.ico$/i.test(link))
        return true
    const linkLower = link.toLowerCase()
    return filteredParts.some((part) => linkLower.indexOf(part) !== -1)
}

/**
 * @param urlObj a parsed URL
 * @returns true if the hostname is contained in knownDomain.IGNORE_LIST
 */
const isIgnoredLink = (urlObj: url.Url) => {
    return knownDomain.IGNORE_LIST.has(urlObj.hostname)
}

/**
 * Scans the page for every occurrence of `startTxt + domain` and extracts the link starting there.
 *
 * @param html the page with its original upper/lowercase
 * @param htmlLower the lower-cased page; all searching happens here, so it has to have the
 *        same length as `html` for the offsets to line up
 * @param domain the domain to look for, or "" to match any domain
 * @param startTxt the marker a link starts with, e.g. "http://" or "www."
 * @param filterSpamLinks whether links matching isFilteredLink() are dropped
 * @returns the links in the order they were found (may contain duplicates)
 */
const getLinkForMarker = (html, htmlLower, domain, startTxt, filterSpamLinks = true) => {
    const links = []
    const max = htmlLower.length
    // if startTxt is a protocol add nothing, otherwise assume http
    const linkProtocol = startTxt.indexOf('http') !== 0 && startTxt.indexOf(':') === -1 ? 'http://' : ''
    // a domain given without extension must be followed by a dot
    const domainEnding = domain.length !== 0 && domain.match(/\.[a-z0-9]+$/i) === null ? '.' : ''
    //let removeBackslash = startTxt.indexOf('\\') !== -1 // they get extracted with "www." marker too
    const needle = startTxt + domain + domainEnding // loop-invariant
    let start = 0
    let count = 0
    while ((start = htmlLower.indexOf(needle, start)) !== -1) {
        count++
        if (count > MAX_LINKS_PER_PAGE)
            break
        const curStart = start // cache it
        start += domain.length + startTxt.length + 1
        // Terminators that are missing or not strictly after `start` count as "end of page".
        const ends = LINK_TERMINATORS.map((terminator) => {
            const pos = htmlLower.indexOf(terminator, start)
            return pos === -1 || pos <= start ? max : pos
        })
        const end = Math.min(...ends)
        let link = linkProtocol + html.substring(curStart, end) // get it from the original html (preserve upper/lowercase)
        //if (removeBackslash)
        if (link.indexOf('\\') !== -1)
            link = link.replace(/\\/g, '')
        //link = entities.decodeHTML(link).replace(/ *\\r$/, '').trim() // doesn't always work. why?
        link = entities.decodeHTML(link).trim()
        // Only skip a few characters instead of jumping to `end`, so the scan continues
        // inside the current link.
        start += domain.length + 1
        const urlObj = utils.parseUrl(link)
        if (!urlObj || !urlObj.host)
            continue
        if (SKIP_EMPTY_URL_PATH && (!urlObj.pathname || urlObj.pathname === "/") && !urlObj.query)
            continue
        // deref is often not called because the url dereferer is not part of our known domains, i.e. startPos is in the query string already
        link = derefLink(urlObj, link)
        if (!link) // check again
            continue
        if (filterSpamLinks === true && isFilteredLink(link))
            continue
        if (isIgnoredLink(urlObj))
            continue
        links.push(link)
    }
    return links
}

/**
 * Collects the links for `domain` using the markers "http://", "https://", their
 * backslash-escaped forms and "www.".
 *
 * @param html the page to scan
 * @param domain the domain to look for, or "" to match any domain
 * @returns the links found (may contain duplicates)
 */
const getExternalLinks = (html, domain) => {
    if (decodeHtmlLinks === true)
        html = utils.decodeHtml(html, false)
    const htmlLower = html.toLocaleLowerCase()
    // Lower-casing can change the length of a string for a few characters. Fall back to the
    // lower-cased text then, so the offsets found in htmlLower stay valid for html.
    if (htmlLower.length !== html.length)
        html = htmlLower
    const markers = ['http://', 'https://', 'http:\\/\\/', 'https:\\/\\/', 'www.']
    let links = []
    for (const marker of markers)
        links = links.concat(getLinkForMarker(html, htmlLower, domain, marker))
    return links
}

/**
 * Enables decoding of HTML strings for LinkExtractor to find links such as href='ht...'
 * @param decode
 */
export function setDecodeHtml(decode: boolean) {
    decodeHtmlLinks = decode;
}

/**
 * Injects the module describing the known domains. Must be called before any extractor is used.
 *
 * @param knownDomainModule must expose the Sets DEREFERER, DOMAINS, CRYPTERS and IGNORE_LIST
 *        and the functions isHoster and isCrypter
 * @throws Error if one of these properties is missing or has the wrong type
 */
export function setKnownDomains(knownDomainModule) {
    const requireSet = ["DEREFERER", "DOMAINS", "CRYPTERS", "IGNORE_LIST"]
    const requiredFun = ["isHoster", "isCrypter"]
    for (const required of requireSet) {
        if (!knownDomainModule[required] || knownDomainModule[required] instanceof Set === false)
            throw new Error("Invalid knownDomainModule Module. The following Sets must be present: " + requireSet.toString())
    }
    for (const required of requiredFun) {
        if (typeof knownDomainModule[required] !== "function")
            throw new Error("Invalid knownDomainModule Module. The following function properties must be present: " + requiredFun.toString())
    }
    knownDomain = knownDomainModule
}

/**
 * Replaces the list of substrings that mark a link as spam.
 *
 * @param filteredArr the new list; entries are compared against the lower-cased link
 */
export function setFilteredUrlParts(filteredArr) {
    filteredParts = filteredArr
}

/**
 * Extracts the links of all domains from the page.
 *
 * @param html the page to scan
 * @param cb called as cb(null, links) if supplied
 */
export function extractAllLinks(html, cb = null) {
    let links = getExternalLinks(html, "")
    links = utils.getUniqueUrls(links)
    cb && cb(null, links)
}

/**
 * Extracts the links pointing to known hosters, either directly or through a known dereferer.
 *
 * @param html the page to scan
 * @param decrypt not implemented; must be false
 * @param cb called as cb(null, links) if supplied
 * @throws a string if `decrypt` is true
 */
export function extractHosterLinks(html, decrypt = false, cb = null) {
    // TODO if decrypt is set to true:
    // 1. search also for domain names without extension
    // 2. open those links and check if we find and decrypt a known hoster domain
    if (decrypt === true)
        throw "extractHosterLinks() with decrypt option is not yet implemented"
    let links = []
    knownDomain.DOMAINS.forEach((domain) => {
        if (knownDomain.isHoster(domain))
            links = links.concat(getExternalLinks(html, domain))
    })
    knownDomain.DEREFERER.forEach((domain) => {
        // keep only the dereferer links that resolve to a hoster
        const resolved = getExternalLinks(html, domain)
        links = links.concat(resolved.filter((l) => knownDomain.isHoster(utils.getRootHostname(l))))
    })
    //links = utils.uniqueArrayValues(links)
    links = utils.getUniqueUrls(links)
    cb && cb(null, links)
}

/**
 * Extracts the links pointing to known crypters, either directly or through a known dereferer.
 *
 * @param html the page to scan
 * @param decrypt not implemented; must be false
 * @param cb called as cb(null, links) if supplied
 * @throws a string if `decrypt` is true
 */
export function extractCrypterLinks(html, decrypt = false, cb = null) {
    if (decrypt === true)
        throw "extractCrypterLinks() with decrypt option is not yet implemented"
    let links = []
    knownDomain.CRYPTERS.forEach((domain) => {
        links = links.concat(getExternalLinks(html, domain))
    })
    knownDomain.DEREFERER.forEach((domain) => {
        // keep only the dereferer links that resolve to a crypter
        const resolved = getExternalLinks(html, domain)
        links = links.concat(resolved.filter((l) => knownDomain.isCrypter(utils.getRootHostname(l))))
    })
    links = utils.getUniqueUrls(links)
    // TODO decrypt async in here with new function extractor.decryptLinks()
    cb && cb(null, links)
}

/**
 * Extracts the links that use a custom protocol. The spam filter is not applied to them.
 *
 * @param html the page to scan
 * @param protocol the protocol to look for; "://" is appended unless it already ends with
 *        it or starts with "magnet:"
 * @param domain the domain to look for, or "" to match any domain
 * @param cb called as cb(null, links) if supplied
 */
export function exctractCustomLinks(html, protocol = 'http', domain = '', cb = null) {
    const htmlLower = html.toLocaleLowerCase()
    // Lower-casing can change the length of a string for a few characters. Fall back to the
    // lower-cased text then, so the offsets found in htmlLower stay valid for html.
    if (htmlLower.length !== html.length)
        html = htmlLower
    if (!protocol.endsWith('://') && !protocol.startsWith('magnet:')) // can be ed2k:// or magnet:?xt=urn:btih:
        protocol += '://'
    let links = getLinkForMarker(html, htmlLower, domain, protocol, false)
    links = utils.getUniqueUrls(links)
    cb && cb(null, links)
}