UNPKG

14.3 kBJavaScriptView Raw
1"use strict";
2Object.defineProperty(exports, "__esModule", { value: true });
3exports.Request = void 0;
4const tslib_1 = require("tslib");
5const utilities_1 = require("@apify/utilities");
6const node_crypto_1 = tslib_1.__importDefault(require("node:crypto"));
7const ow_1 = tslib_1.__importStar(require("ow"));
8const node_util_1 = tslib_1.__importDefault(require("node:util"));
9const log_1 = require("./log");
10const typedefs_1 = require("./typedefs");
// NOTE: adding new properties to the Request object breaks serialization.
12const log = log_1.log.child({ prefix: 'Request' });
/**
 * `ow` predicates for the optional `Request` constructor options, keyed by option name.
 * The constructor looks up each supplied option here; `url` is not listed because it is
 * mandatory and validated on its own.
 */
const requestOptionalPredicates = (() => {
    const ow = ow_1.default;
    return {
        // Identity of the request.
        id: ow.optional.string,
        loadedUrl: ow.optional.string.url,
        uniqueKey: ow.optional.string,
        // HTTP specifics.
        method: ow.optional.string,
        payload: ow.optional.any(ow.string, ow.buffer),
        // Retry bookkeeping.
        noRetry: ow.optional.boolean,
        retryCount: ow.optional.number,
        errorMessages: ow.optional.array.ofType(ow.string),
        headers: ow.optional.object,
        // User supplied metadata.
        userData: ow.optional.object,
        label: ow.optional.string,
        // Either a date string or a `Date` instance.
        handledAt: ow.optional.any(ow.string.date, ow.date),
        // Flags consumed while building the unique key / by the crawler.
        keepUrlFragment: ow.optional.boolean,
        useExtendedUniqueKey: ow.optional.boolean,
        skipNavigation: ow.optional.boolean,
    };
})();
/** Stored when nothing printable can be derived from a value passed to `pushErrorMessage()`. */
const UNEXTRACTABLE_ERROR_MESSAGE = 'Unable to extract any message from the received object.';

/**
 * Converts an arbitrary value to a string without ever throwing.
 *
 * Strings are returned as they are. Other values use their own `toString()`, unless it is
 * missing, throws, returns a non-string or yields the uninformative `[object Object]`;
 * in those cases `util.inspect()` is used instead.
 *
 * @param value Any value.
 * @returns A string representation of `value`.
 */
function stringifyErrorValue(value) {
    if (typeof value === 'string') {
        return value;
    }
    let text;
    try {
        text = value.toString();
    } catch {
        // Deliberately ignored: `null`, `undefined`, null-prototype objects and throwing
        // `toString()` implementations fall through to `util.inspect()` below.
    }
    if (typeof text === 'string' && text !== '[object Object]') {
        return text;
    }
    try {
        return node_util_1.default.inspect(value);
    } catch {
        return UNEXTRACTABLE_ERROR_MESSAGE;
    }
}

/**
 * Derives the message that `Request.pushErrorMessage()` stores for a thrown value.
 *
 * @param errorOrMessage The thrown value, of any type.
 * @param omitStack When truthy, `Error` instances contribute only their `message`.
 * @returns The message; always a string.
 */
function extractErrorMessage(errorOrMessage, omitStack) {
    if (errorOrMessage === null) {
        return 'null';
    }
    if (errorOrMessage === undefined) {
        return 'undefined';
    }
    if (typeof errorOrMessage === 'object') {
        if (errorOrMessage instanceof Error) {
            // `.stack` includes the message; fall back to the message when the stack is missing.
            const text = omitStack
                ? errorOrMessage.message
                : (errorOrMessage.stack || errorOrMessage.message);
            return stringifyErrorValue(text);
        }
        if (Reflect.has(errorOrMessage, 'message')) {
            return stringifyErrorValue(Reflect.get(errorOrMessage, 'message'));
        }
    }
    return stringifyErrorValue(errorOrMessage);
}

/**
 * Represents a URL to be crawled, optionally including HTTP method, headers, payload and other metadata.
 * The `Request` object also stores information about errors that occurred during processing of the request.
 *
 * Each `Request` instance has the `uniqueKey` property, which can be either specified
 * manually in the constructor or generated automatically from the URL. Two requests with the same `uniqueKey`
 * are considered as pointing to the same web resource. This behavior applies to all Crawlee classes,
 * such as {@apilink RequestList}, {@apilink RequestQueue}, {@apilink PuppeteerCrawler} or {@apilink PlaywrightCrawler}.
 *
 * Example use:
 *
 * ```javascript
 * const request = new Request({
 *     url: 'http://www.example.com',
 *     headers: { Accept: 'application/json' },
 * });
 *
 * ...
 *
 * request.userData.foo = 'bar';
 * request.pushErrorMessage(new Error('Request failed!'));
 *
 * ...
 *
 * const foo = request.userData.foo;
 * ```
 * @category Sources
 */
class Request {
    /**
     * @param options `Request` parameters including the URL, HTTP method and headers, and others.
     * @throws If `options` fails validation (not an object, `url` not a string, an option
     *   of the wrong type, or an unknown option), or if a `GET` request carries a payload.
     */
    constructor(options) {
        // Every field is created as an own, enumerable, writable, configurable data property,
        // in exactly this order.
        const declareField = (name, value) => Object.defineProperty(this, name, {
            enumerable: true,
            configurable: true,
            writable: true,
            value,
        });
        /** Request ID */
        declareField('id');
        /** URL of the web page to crawl. */
        declareField('url');
        /**
         * An actually loaded URL after redirects, if present. HTTP redirects are guaranteed
         * to be included.
         *
         * When using {@apilink PuppeteerCrawler} or {@apilink PlaywrightCrawler}, meta tag and JavaScript redirects may,
         * or may not be included, depending on their nature. This generally means that redirects,
         * which happen immediately will most likely be included, but delayed redirects will not.
         */
        declareField('loadedUrl');
        /**
         * A unique key identifying the request.
         * Two requests with the same `uniqueKey` are considered as pointing to the same URL.
         */
        declareField('uniqueKey');
        /** HTTP method, e.g. `GET` or `POST`. */
        declareField('method');
        /** HTTP request payload, e.g. for POST requests. */
        declareField('payload');
        /** The `true` value indicates that the request will not be automatically retried on error. */
        declareField('noRetry');
        /** Indicates the number of times the crawling of the request has been retried on error. */
        declareField('retryCount');
        /** An array of error messages from request processing. */
        declareField('errorMessages');
        /** Object with HTTP headers. Key is header name, value is the value. */
        declareField('headers');
        /** Private store for the custom user data assigned to the request. */
        declareField('_userData', {});
        /** Custom user data assigned to the request. */
        declareField('userData', {});
        /**
         * ISO datetime string that indicates the time when the request has been processed.
         * Stays `undefined` when no `handledAt` option is supplied.
         */
        declareField('handledAt');

        const ow = ow_1.default;
        ow(options, 'RequestOptions', ow.object);
        // `url` is the only mandatory option, so it is validated on its own.
        ow(options.url, 'RequestOptions.url', ow.string);
        // 'ow' validation is slow, because it checks all predicates
        // even if the validated object has only 1 property.
        // This custom validation loop iterates only over existing
        // properties and speeds up the validation cca 3-fold.
        // See https://github.com/sindresorhus/ow/issues/193
        for (const prop of (0, typedefs_1.keys)(options)) {
            const predicate = requestOptionalPredicates[prop];
            const value = options[prop];
            if (predicate) {
                ow(value, `RequestOptions.${prop}`, predicate);
            } else if (prop !== 'url') {
                // 'url' has no entry in the optional predicates, but it was already checked above.
                const msg = `Did not expect property \`${prop}\` to exist, got \`${value}\` in object \`RequestOptions\``;
                throw new ow_1.ArgumentError(msg, this.constructor);
            }
        }

        const {
            id,
            url,
            loadedUrl,
            uniqueKey,
            payload,
            noRetry = false,
            retryCount = 0,
            errorMessages = [],
            headers = {},
            userData = {},
            label,
            handledAt,
            keepUrlFragment = false,
            useExtendedUniqueKey = false,
            skipNavigation,
        } = options;
        const method = (options.method ?? 'GET').toUpperCase();
        if (method === 'GET' && payload) {
            throw new Error('Request with GET method cannot have a payload.');
        }

        this.id = id;
        this.url = url;
        this.loadedUrl = loadedUrl;
        this.uniqueKey = uniqueKey || this._computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey });
        this.method = method;
        this.payload = payload;
        this.noRetry = noRetry;
        this.retryCount = retryCount;
        // Copies, so later pushes / header edits do not touch the caller's array or object.
        this.errorMessages = [...errorMessages];
        this.headers = { ...headers };
        this.handledAt = handledAt instanceof Date ? handledAt.toISOString() : handledAt;

        // The supplied `userData` object itself becomes `request.userData` (see the
        // re-assignment below), so the label is written straight onto it.
        if (label) {
            userData.label = label;
        }

        Object.defineProperties(this, {
            // Backing store: hidden from enumeration (and therefore from JSON output).
            _userData: {
                value: { __crawlee: {}, ...userData },
                enumerable: false,
                writable: true,
            },
            userData: {
                get: () => this._userData,
                set: (value) => {
                    Object.defineProperties(value, {
                        // Carry the internal `__crawlee` state over to the new object, non-enumerable.
                        __crawlee: {
                            // eslint-disable-next-line no-underscore-dangle
                            value: this._userData.__crawlee,
                            enumerable: false,
                            writable: true,
                        },
                        // Emit `__crawlee` in JSON only when it actually holds something.
                        toJSON: {
                            value: () => {
                                // eslint-disable-next-line no-underscore-dangle
                                if (Object.keys(this._userData.__crawlee).length > 0) {
                                    return ({
                                        ...this._userData,
                                        // eslint-disable-next-line no-underscore-dangle
                                        __crawlee: this._userData.__crawlee,
                                    });
                                }
                                return this._userData;
                            },
                            enumerable: false,
                            writable: true,
                        },
                    });
                    this._userData = value;
                },
                enumerable: true,
            },
        });
        // reassign userData to ensure internal `__crawlee` object is non-enumerable
        this.userData = userData;
        if (skipNavigation != null) {
            this.skipNavigation = skipNavigation;
        }
    }

    /** Tells the crawler processing this request to skip the navigation and process the request directly. */
    get skipNavigation() {
        // eslint-disable-next-line no-underscore-dangle
        return this.userData.__crawlee?.skipNavigation ?? false;
    }

    set skipNavigation(value) {
        // eslint-disable-next-line no-underscore-dangle
        if (!this.userData.__crawlee) {
            // eslint-disable-next-line no-underscore-dangle
            this.userData.__crawlee = { skipNavigation: value };
        } else {
            // eslint-disable-next-line no-underscore-dangle
            this.userData.__crawlee.skipNavigation = value;
        }
    }

    /** shortcut for getting `request.userData.label` */
    get label() {
        return this.userData.label;
    }

    /** shortcut for setting `request.userData.label` */
    set label(value) {
        this.userData.label = value;
    }

    /**
     * Stores information about an error that occurred during processing of this request.
     *
     * You should always use Error instances when throwing errors in JavaScript.
     *
     * Nevertheless, to improve the debugging experience when using third party libraries
     * that may not always throw an Error instance, the function performs a type
     * inspection of the passed argument and attempts to extract as much information
     * as possible, since just throwing a bad type error makes any debugging rather difficult.
     *
     * The stored entry is always a string, and extraction itself never throws: values without
     * a usable `toString()` (e.g. null-prototype objects) are rendered with `util.inspect()`.
     *
     * @param errorOrMessage Error object or error message to be stored in the request.
     * @param [options]
     * @param [options.omitStack] Store only `error.message` instead of `error.stack` for `Error` instances.
     */
    pushErrorMessage(errorOrMessage, options = {}) {
        const { omitStack } = options;
        this.errorMessages.push(extractErrorMessage(errorOrMessage, omitStack));
    }

    /**
     * Builds the `uniqueKey` used when none was supplied to the constructor.
     *
     * Without `useExtendedUniqueKey` the key is just the normalized URL; with it, the key has
     * the form `METHOD(payloadHash):normalizedUrl`.
     *
     * @returns The computed unique key.
     */
    _computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey }) {
        const normalizedMethod = method.toUpperCase();
        // normalizeUrl() returns null when url is invalid, causing weird errors, hence the fallback.
        const normalizedUrl = (0, utilities_1.normalizeUrl)(url, keepUrlFragment) || url;
        if (useExtendedUniqueKey) {
            const payloadHash = payload ? this._hashPayload(payload) : '';
            return `${normalizedMethod}(${payloadHash}):${normalizedUrl}`;
        }
        if (normalizedMethod !== 'GET' && payload) {
            // Using log.deprecated to log only once. We should add log.once or some such.
            log.deprecated(`We've encountered a ${normalizedMethod} Request with a payload. `
                + 'This is fine. Just letting you know that if your requests point to the same URL '
                + 'and differ only in method and payload, you should see the "useExtendedUniqueKey" option of Request constructor.');
        }
        return normalizedUrl;
    }

    /**
     * Hashes the payload with SHA-256 and returns the first 8 characters of the base64 digest
     * after the `+`, `/` and `=` characters have been stripped from it.
     *
     * @param payload Request payload (string or buffer).
     * @returns Short hash of the payload.
     */
    _hashPayload(payload) {
        return node_crypto_1.default
            .createHash('sha256')
            .update(payload)
            .digest('base64')
            .replace(/[+/=]/g, '')
            .substring(0, 8);
    }
}
341exports.Request = Request;
342//# sourceMappingURL=request.js.map
\No newline at end of file