1 | "use strict";
|
2 | Object.defineProperty(exports, "__esModule", { value: true });
|
3 | exports.createRequestOptions = exports.filterRequestsByPatterns = exports.createRequests = exports.constructRegExpObjectsFromRegExps = exports.validateGlobPattern = exports.constructGlobObjectsFromGlobs = exports.constructRegExpObjectsFromPseudoUrls = exports.updateEnqueueLinksPatternCache = void 0;
|
4 | const tslib_1 = require("tslib");
|
5 | const url_1 = require("url");
|
6 | const pseudo_url_1 = require("@apify/pseudo_url");
|
7 | const minimatch_1 = tslib_1.__importDefault(require("minimatch"));
|
8 | const request_1 = require("../request");
|
// Upper bound on the number of entries kept in `enqueueLinksPatternCache`;
// `updateEnqueueLinksPatternCache` evicts one entry whenever the size exceeds it.
const MAX_ENQUEUE_LINKS_CACHE_SIZE = 1000;

// Module-level cache of already-built pattern objects, keyed by the raw pattern input
// (a string, RegExp or options object). The pseudo-URL, glob and RegExp construction
// helpers in this module all read from and write to this single Map.
const enqueueLinksPatternCache = new Map();
|
17 |
|
18 |
|
19 |
|
/**
 * Stores a built pattern object in the shared pattern cache and keeps the cache bounded.
 *
 * After the insert, if the cache holds more than `MAX_ENQUEUE_LINKS_CACHE_SIZE` entries,
 * the first key in the Map's iteration order (the oldest inserted one) is removed.
 *
 * @param {*} item Raw pattern input used as the cache key.
 * @param {object} pattern Pattern object to cache for that input.
 * @returns {void}
 */
function updateEnqueueLinksPatternCache(item, pattern) {
    enqueueLinksPatternCache.set(item, pattern);
    const isWithinLimit = enqueueLinksPatternCache.size <= MAX_ENQUEUE_LINKS_CACHE_SIZE;
    if (isWithinLimit) {
        return;
    }
    // Maps iterate in insertion order, so the first key yielded is the oldest entry.
    const oldestKey = enqueueLinksPatternCache.keys().next().value;
    enqueueLinksPatternCache.delete(oldestKey);
}
|
27 | exports.updateEnqueueLinksPatternCache = updateEnqueueLinksPatternCache;
|
28 |
|
29 |
|
30 |
|
31 |
|
32 |
|
/**
 * Converts pseudo-URL inputs into pattern objects that carry a compiled `regexp`.
 *
 * Each input is either a pseudo-URL string or an object with a `purl` property plus
 * additional request options; the options are copied onto the resulting object.
 * Results are memoized in the shared pattern cache, keyed by the raw input.
 *
 * @param {Array<string|object>} pseudoUrls Pseudo-URL strings or `{ purl, ...options }` objects.
 * @returns {Array<object>} One `{ regexp, ...options }` object per input, in the same order.
 */
function constructRegExpObjectsFromPseudoUrls(pseudoUrls) {
    return pseudoUrls.map((item) => {
        // The cache is shared with the glob helper and keyed by the raw input, so the same
        // string may already be cached as a `{ glob }` object. Only reuse an entry that
        // actually carries a `regexp`; otherwise rebuild it and overwrite the stale entry.
        const cached = enqueueLinksPatternCache.get(item);
        if (cached?.regexp)
            return cached;
        let regexpObject;
        if (typeof item === 'string') {
            regexpObject = { regexp: (0, pseudo_url_1.purlToRegExp)(item) };
        }
        else {
            const { purl, ...requestOptions } = item;
            regexpObject = { regexp: (0, pseudo_url_1.purlToRegExp)(purl), ...requestOptions };
        }
        updateEnqueueLinksPatternCache(item, regexpObject);
        return regexpObject;
    });
}
|
50 | exports.constructRegExpObjectsFromPseudoUrls = constructRegExpObjectsFromPseudoUrls;
|
51 |
|
52 |
|
53 |
|
54 |
|
55 |
|
/**
 * Converts glob inputs into pattern objects that carry a validated `glob` string.
 *
 * Each input is either a glob string or an object with a `glob` property plus additional
 * request options; the options are copied onto the resulting object. Every glob goes
 * through `validateGlobPattern`, which trims it and throws on an empty pattern.
 * Results are memoized in the shared pattern cache, keyed by the raw input.
 *
 * @param {Array<string|object>} globs Glob strings or `{ glob, ...options }` objects.
 * @returns {Array<object>} One `{ glob, ...options }` object per input, in the same order.
 * @throws {Error} When a glob is empty after trimming.
 */
function constructGlobObjectsFromGlobs(globs) {
    return globs.map((item) => {
        // The cache is shared with the pseudo-URL helper and keyed by the raw input, so the
        // same string may already be cached as a `{ regexp }` object. Only reuse an entry
        // that actually carries a `glob`; otherwise rebuild it and overwrite the stale entry.
        const cached = enqueueLinksPatternCache.get(item);
        if (cached?.glob)
            return cached;
        let globObject;
        if (typeof item === 'string') {
            globObject = { glob: validateGlobPattern(item) };
        }
        else {
            const { glob, ...requestOptions } = item;
            globObject = { glob: validateGlobPattern(glob), ...requestOptions };
        }
        updateEnqueueLinksPatternCache(item, globObject);
        return globObject;
    });
}
|
73 | exports.constructGlobObjectsFromGlobs = constructGlobObjectsFromGlobs;
|
74 |
|
75 |
|
76 |
|
/**
 * Trims a glob pattern and rejects it when nothing is left.
 *
 * @param {string} glob Raw glob pattern.
 * @returns {string} The pattern with leading and trailing whitespace removed.
 * @throws {Error} When the trimmed pattern is empty.
 */
function validateGlobPattern(glob) {
    const normalized = glob.trim();
    if (normalized.length !== 0) {
        return normalized;
    }
    throw new Error(`Cannot parse Glob pattern '${normalized}': it must be an non-empty string`);
}
|
83 | exports.validateGlobPattern = validateGlobPattern;
|
84 |
|
85 |
|
86 |
|
87 |
|
88 |
|
/**
 * Normalizes RegExp inputs into pattern objects that carry a `regexp` property.
 *
 * A bare `RegExp` is wrapped as `{ regexp }`; any other input is used as the pattern
 * object unchanged. Results are memoized in the shared pattern cache, keyed by the raw input.
 *
 * @param {Array<RegExp|object>} regexps RegExp instances or ready-made pattern objects.
 * @returns {Array<object>} One pattern object per input, in the same order.
 */
function constructRegExpObjectsFromRegExps(regexps) {
    const toPatternObject = (item) => {
        const cached = enqueueLinksPatternCache.get(item);
        if (cached) {
            return cached;
        }
        const built = item instanceof RegExp ? { regexp: item } : item;
        updateEnqueueLinksPatternCache(item, built);
        return built;
    };
    return regexps.map((item) => toPatternObject(item));
}
|
105 | exports.constructRegExpObjectsFromRegExps = constructRegExpObjectsFromRegExps;
|
106 |
|
107 |
|
108 |
|
/**
 * Builds `Request` instances from request options, optionally keeping only those whose
 * URL matches one of the given pattern objects.
 *
 * Without patterns, every entry becomes a `Request`. With patterns, an entry is kept only
 * if its URL matches a pattern's `regexp` or (case-insensitively) its `glob`; the first
 * matching pattern wins and its remaining properties are merged over the entry's options.
 *
 * @param {Array<string|object>} requestOptions URL strings or request-options objects.
 * @param {Array<object>} [urlPatternObjects] Pattern objects of the form `{ regexp?, glob?, ...options }`.
 * @returns {Array<Request>} The created requests, in input order.
 */
function createRequests(requestOptions, urlPatternObjects) {
    const asOptions = (entry) => (typeof entry === 'string' ? { url: entry } : entry);
    if (!urlPatternObjects?.length) {
        return requestOptions.map((entry) => new request_1.Request(asOptions(entry)));
    }
    const built = [];
    for (const entry of requestOptions) {
        const candidateUrl = typeof entry === 'string' ? entry : entry.url;
        for (const { regexp, glob, ...patternOverrides } of urlPatternObjects) {
            const isRegexpHit = regexp && candidateUrl.match(regexp);
            if (!isRegexpHit && !(glob && (0, minimatch_1.default)(candidateUrl, glob, { nocase: true }))) {
                continue;
            }
            built.push(new request_1.Request({ ...asOptions(entry), ...patternOverrides }));
            // Only the first matching pattern contributes; stop scanning for this entry.
            break;
        }
    }
    return built;
}
|
132 | exports.createRequests = createRequests;
|
/**
 * Keeps only the requests whose URL matches at least one pattern object.
 *
 * A request matches when its `url` matches a pattern's `regexp` or, case-insensitively,
 * a pattern's `glob`. When no patterns are supplied, the input is returned as-is.
 *
 * @param {Array<object>} requests Objects exposing a `url` string.
 * @param {Array<object>} [patterns] Pattern objects of the form `{ regexp?, glob? }`.
 * @returns {Array<object>} The same `requests` value when there are no patterns,
 *   otherwise a new array with the matching requests in input order.
 */
function filterRequestsByPatterns(requests, patterns) {
    const hasPatterns = Boolean(patterns?.length);
    if (!hasPatterns) {
        return requests;
    }
    const matchesAnyPattern = (url) => {
        for (const { regexp, glob } of patterns) {
            if (regexp && url.match(regexp)) {
                return true;
            }
            if (glob && (0, minimatch_1.default)(url, glob, { nocase: true })) {
                return true;
            }
        }
        return false;
    };
    const kept = [];
    for (const candidate of requests) {
        if (matchesAnyPattern(candidate.url)) {
            kept.push(candidate);
        }
    }
    return kept;
}
|
151 | exports.filterRequestsByPatterns = filterRequestsByPatterns;
|
152 |
|
153 |
|
154 |
|
/**
 * Normalizes raw sources into request-options objects with `userData` and `label` defaults.
 *
 * String sources become `{ url }`. Sources whose `url` cannot be parsed by the WHATWG `URL`
 * constructor are dropped. A source's own `userData` takes precedence over
 * `options.userData`, and an existing `userData.label` takes precedence over `options.label`.
 *
 * Every returned object is a shallow copy with its own `userData` object, so neither the
 * sources nor `options.userData` are mutated and no two results share a `userData` reference.
 *
 * @param {Array<string|object>} sources URL strings or request-options objects with a `url`.
 * @param {object} [options] Defaults to apply.
 * @param {object} [options.userData] Used for sources that have no `userData` of their own.
 * @param {string} [options.label] Used when the resulting `userData` has no `label`.
 * @returns {Array<object>} Request-options objects for the sources with a valid URL, in input order.
 */
function createRequestOptions(sources, options = {}) {
    return sources
        .map((src) => (typeof src === 'string' ? { url: src } : src))
        .filter(({ url }) => {
            try {
                // Parsing is the validation: an unparsable URL throws and the source is skipped.
                return Boolean(new url_1.URL(url).href);
            }
            catch {
                return false;
            }
        })
        .map((requestOptions) => {
            // Copy instead of assigning `options.userData` by reference: writing the label into
            // a shared object would leak it into the caller's options and into every request.
            const userData = { ...(requestOptions.userData ?? options.userData ?? {}) };
            userData.label = userData.label ?? options.label;
            return { ...requestOptions, userData };
        });
}
|
173 | exports.createRequestOptions = createRequestOptions;
|
174 |
|
\ | No newline at end of file |