UNPKG

6.69 kBJavaScriptView Raw
1"use strict";
2Object.defineProperty(exports, "__esModule", { value: true });
3exports.createRequestOptions = exports.filterRequestsByPatterns = exports.createRequests = exports.constructRegExpObjectsFromRegExps = exports.validateGlobPattern = exports.constructGlobObjectsFromGlobs = exports.constructRegExpObjectsFromPseudoUrls = exports.updateEnqueueLinksPatternCache = void 0;
4const tslib_1 = require("tslib");
5const url_1 = require("url");
6const pseudo_url_1 = require("@apify/pseudo_url");
7const minimatch_1 = tslib_1.__importDefault(require("minimatch"));
8const request_1 = require("../request");
// Upper bound on the number of entries kept in `enqueueLinksPatternCache`;
// `updateEnqueueLinksPatternCache()` removes the first-inserted key once the size exceeds it.
const MAX_ENQUEUE_LINKS_CACHE_SIZE = 1000;
/**
 * To enable direct use of the Actor UI `globs`/`regexps`/`pseudoUrls` output while keeping high performance,
 * all the regexps from the output are only constructed once and kept in a cache
 * by the `enqueueLinks()` function.
 *
 * Keys are the raw input items (strings, RegExp instances or option objects),
 * values are the pattern objects constructed from them.
 * @ignore
 */
const enqueueLinksPatternCache = new Map();
/**
 * Stores a constructed pattern object in the shared `enqueueLinks()` pattern cache
 * under its original input item and, once the cache has grown past
 * `MAX_ENQUEUE_LINKS_CACHE_SIZE`, removes a single entry to bring it back to the limit.
 *
 * @param item The raw pattern input used as the cache key.
 * @param pattern The constructed pattern object to remember for `item`.
 * @ignore
 */
function updateEnqueueLinksPatternCache(item, pattern) {
    enqueueLinksPatternCache.set(item, pattern);
    if (enqueueLinksPatternCache.size <= MAX_ENQUEUE_LINKS_CACHE_SIZE) {
        return;
    }
    // A Map iterates its keys in insertion order, so the first key is the oldest entry.
    const [oldestKey] = enqueueLinksPatternCache.keys();
    enqueueLinksPatternCache.delete(oldestKey);
}
27exports.updateEnqueueLinksPatternCache = updateEnqueueLinksPatternCache;
/**
 * Helper factory used in the `enqueueLinks()` and `enqueueLinksByClickingElements()` functions
 * to construct RegExps from PseudoUrl strings.
 *
 * Every input item is either a PseudoUrl string or an object with a `purl` property
 * plus additional request options. The result for each item is an object holding the
 * compiled `regexp` together with any of those additional options. Constructed objects
 * are remembered in the shared pattern cache, keyed by the input item.
 *
 * @param pseudoUrls Array of PseudoUrl strings or `{ purl, ...requestOptions }` objects.
 * @returns Array of `{ regexp, ...requestOptions }` objects, one per input item.
 * @ignore
 */
function constructRegExpObjectsFromPseudoUrls(pseudoUrls) {
    return pseudoUrls.map((item) => {
        const cached = enqueueLinksPatternCache.get(item);
        // The cache is shared with the glob helper and keyed by the raw input, so the very same
        // string may already be stored there as a `{ glob }` object. Only reuse an entry that
        // actually carries a `regexp`; otherwise build the correct object and overwrite it.
        if (cached?.regexp) {
            return cached;
        }
        let regexpObject;
        if (typeof item === 'string') {
            regexpObject = { regexp: (0, pseudo_url_1.purlToRegExp)(item) };
        }
        else {
            const { purl, ...requestOptions } = item;
            regexpObject = { regexp: (0, pseudo_url_1.purlToRegExp)(purl), ...requestOptions };
        }
        updateEnqueueLinksPatternCache(item, regexpObject);
        return regexpObject;
    });
}
50exports.constructRegExpObjectsFromPseudoUrls = constructRegExpObjectsFromPseudoUrls;
/**
 * Helper factory used in the `enqueueLinks()` and `enqueueLinksByClickingElements()` functions
 * to construct Glob objects from Glob pattern strings.
 *
 * Every input item is either a glob string or an object with a `glob` property plus
 * additional request options. Each glob is passed through `validateGlobPattern()`, which
 * trims it and throws on an empty pattern. Constructed objects are remembered in the
 * shared pattern cache, keyed by the input item.
 *
 * @param globs Array of glob strings or `{ glob, ...requestOptions }` objects.
 * @returns Array of `{ glob, ...requestOptions }` objects, one per input item.
 * @throws {Error} When a glob pattern is empty after trimming.
 * @ignore
 */
function constructGlobObjectsFromGlobs(globs) {
    return globs.map((item) => {
        const cached = enqueueLinksPatternCache.get(item);
        // The cache is shared with the PseudoUrl helper and keyed by the raw input, so the very
        // same string may already be stored there as a `{ regexp }` object. Only reuse an entry
        // that actually carries a `glob`; otherwise build the correct object and overwrite it.
        if (cached?.glob) {
            return cached;
        }
        let globObject;
        if (typeof item === 'string') {
            globObject = { glob: validateGlobPattern(item) };
        }
        else {
            const { glob, ...requestOptions } = item;
            globObject = { glob: validateGlobPattern(glob), ...requestOptions };
        }
        updateEnqueueLinksPatternCache(item, globObject);
        return globObject;
    });
}
73exports.constructGlobObjectsFromGlobs = constructGlobObjectsFromGlobs;
/**
 * Strips surrounding whitespace from a glob pattern and rejects patterns
 * that are empty once trimmed.
 *
 * @param glob The glob pattern string to check.
 * @returns The trimmed glob pattern.
 * @throws {Error} When the trimmed pattern has zero length.
 * @internal
 */
function validateGlobPattern(glob) {
    const normalized = glob.trim();
    if (normalized !== '') {
        return normalized;
    }
    throw new Error(`Cannot parse Glob pattern '${normalized}': it must be an non-empty string`);
}
83exports.validateGlobPattern = validateGlobPattern;
/**
 * Helper factory used in the `enqueueLinks()` and `enqueueLinksByClickingElements()` functions
 * to normalize RegExp input into pattern objects.
 *
 * A bare `RegExp` instance is wrapped as `{ regexp }`; any other item is used as-is.
 * Results are remembered in the shared pattern cache, keyed by the input item.
 *
 * @param regexps Array of `RegExp` instances or already-shaped pattern objects.
 * @returns Array of pattern objects, one per input item.
 * @ignore
 */
function constructRegExpObjectsFromRegExps(regexps) {
    const toRegExpObject = (entry) => {
        const cached = enqueueLinksPatternCache.get(entry);
        if (cached) {
            return cached;
        }
        const created = entry instanceof RegExp ? { regexp: entry } : entry;
        updateEnqueueLinksPatternCache(entry, created);
        return created;
    };
    return regexps.map((entry) => toRegExpObject(entry));
}
105exports.constructRegExpObjectsFromRegExps = constructRegExpObjectsFromRegExps;
/**
 * Builds `Request` instances from request options, optionally keeping only those
 * whose URL matches one of the given URL pattern objects.
 *
 * Without patterns, every item becomes a `Request`. With patterns, an item is turned
 * into a `Request` only when its URL matches a pattern's `regexp` or `glob`
 * (globs are matched case-insensitively); the remaining properties of the first
 * matching pattern are merged over the item's own options.
 *
 * @param requestOptions URL strings or request option objects with a `url` property.
 * @param urlPatternObjects Optional pattern objects with `regexp` and/or `glob` plus extra request options.
 * @returns Array of created `Request` instances.
 * @ignore
 */
function createRequests(requestOptions, urlPatternObjects) {
    const asOptionsObject = (opts) => (typeof opts === 'string' ? { url: opts } : opts);
    if (!urlPatternObjects || !urlPatternObjects.length) {
        return requestOptions.map((opts) => new request_1.Request(asOptionsObject(opts)));
    }
    const requests = [];
    for (const opts of requestOptions) {
        const urlToMatch = typeof opts === 'string' ? opts : opts.url;
        // Only the first matching pattern contributes its options to the request.
        const matchedPattern = urlPatternObjects.find(({ regexp, glob }) => {
            if (regexp && urlToMatch.match(regexp)) {
                return true;
            }
            return glob && (0, minimatch_1.default)(urlToMatch, glob, { nocase: true });
        });
        if (!matchedPattern) {
            continue;
        }
        const { regexp, glob, ...requestRegExpOptions } = matchedPattern;
        requests.push(new request_1.Request({ ...asOptionsObject(opts), ...requestRegExpOptions }));
    }
    return requests;
}
132exports.createRequests = createRequests;
/**
 * Keeps only the requests whose `url` matches at least one of the given patterns.
 *
 * A pattern matches through its `regexp` or, failing that, through its `glob`
 * (matched case-insensitively). When no patterns are supplied, the input array
 * itself is returned unchanged.
 *
 * @param requests Objects exposing a `url` string.
 * @param patterns Optional pattern objects with `regexp` and/or `glob`.
 * @returns The matching requests in their original order, each at most once.
 */
function filterRequestsByPatterns(requests, patterns) {
    if (!patterns?.length) {
        return requests;
    }
    const matchesPattern = (url, { regexp, glob }) => {
        if (regexp && url.match(regexp)) {
            return true;
        }
        return Boolean(glob && (0, minimatch_1.default)(url, glob, { nocase: true }));
    };
    // `some` stops at the first matching pattern, so a request is never added twice.
    return requests.filter((request) => patterns.some((pattern) => matchesPattern(request.url, pattern)));
}
151exports.filterRequestsByPatterns = filterRequestsByPatterns;
/**
 * Turns a list of URL sources into request option objects, dropping sources whose
 * URL cannot be parsed, and fills in `userData` / `userData.label` defaults.
 *
 * For every kept source the resulting `userData` is a fresh shallow copy of the source's
 * own `userData` (or of `options.userData` when the source has none), so neither
 * `sources`, `options.userData`, nor any other returned item is mutated or shared.
 * `userData.label` is set to `options.label` only when the source did not provide one.
 *
 * @param sources URL strings or request option objects with a `url` property.
 * @param options Optional defaults: `userData` (object) and `label`.
 * @returns New request option objects for the sources with a valid absolute URL.
 * @ignore
 */
function createRequestOptions(sources, options = {}) {
    const hasValidUrl = ({ url }) => {
        try {
            // The URL constructor throws on anything that is not a parseable absolute URL.
            return Boolean(new url_1.URL(url).href);
        }
        catch {
            return false;
        }
    };
    return sources
        .map((src) => (typeof src === 'string' ? { url: src } : src))
        .filter(hasValidUrl)
        .map((requestOptions) => {
            // Copy instead of assigning by reference: previously `options.userData` itself was
            // attached to every request and then had `label` written into it.
            const userData = { ...(requestOptions.userData ?? options.userData) };
            if (userData.label === undefined || userData.label === null) {
                userData.label = options.label;
            }
            return { ...requestOptions, userData };
        });
}
173exports.createRequestOptions = createRequestOptions;
174//# sourceMappingURL=shared.js.map
\No newline at end of file