UNPKG

25.1 kBJavaScriptView Raw
1const debugLib = require('debug');
2const debug = debugLib('Parser');
3const clone = require('lodash.clone');
4const Actions = require('./Actions');
5const Transforms = require('./Transforms');
6const Storage = require('./Storage');
7const Scope = require('./Scope');
8const { waitForEvaluate } = require('./tools/wait');
/**
 * Highest attempt number at which a failed milestone is still retried.
 * Attempts are counted from 0 and Parser#catchFailedMilestone throws once the
 * attempt number is greater than this value.
 * @type {number}
 */
const MAX_MILESTONE_ATTEMPTS = 2;
10
/**
 * @typedef {object} Rule
 * @property {?string} scope
 * @property {?string} parentScope
 * @property {?string} jsScope
 * @property {?string} jsParentScope
 * @property {string} name
 * @property {?Array.<ActionOptions>} actions
 * @property {?Array.<ActionOptions>} postActions
 * @property {?(Grid|Collection)} collection
 * @property {?Array.<TransformOptions>} transform
 * @property {?boolean} rulesFromActions
 * @property {?boolean} useActionsResult - use the result of the rule's actions as the rule result
 * @property {?string} separator
 * @property {?string} type
 * @property {?string} attr
 * @property {?string} prop
 * @property {?number} child
 * @property {?(boolean|Function)} id
 * @property {?boolean} virtual - when truthy, the rule is processed but its result is not added to the collection result
 * @property {?number} maxItems - upper bound on the number of nodes parsed by a grid rule
 * @property {?boolean} inject
 * @property {?number} injectionTimeout
 * @property {?object} catchError
 * @property {?string} get - storage key to read the result from
 * @property {?string} set - storage key to write the result to
 * @property {?string} add - storage key of the list to append the result to
 * @property {?string} unset - storage key to remove
 * @property {*} value - literal value to use as the result
 */
39
/**
 * @typedef {object} ActionOptions
 * @property {string} type
 * @property {?string} scope
 * @property {?string} parentScope
 * @property {?string} jsScope
 * @property {?string} jsParentScope
 * @property {?object} waitFor
 * @property {string} waitFor.type
 * @property {?object} waitForQuery
 * @property {string} waitForQuery.uri - pattern of the URI to wait for
 * @property {string} waitForQuery.timeout
 * @property {?boolean} waitForPage
 * @property {?number} waitForPageTimeout
 * @property {?boolean} once
 * @property {?boolean} __done - set after the action has been performed for the first time
 * @property {?Array.<ActionOptions>} cases
 * @property {?Array.<ActionOptions>} conditions
 * @property {?Array.<TransformOptions>} transform
 * @property {?string} set
 * @property {?object} change
 * @property {?boolean} useActionsResult
 */
63
/**
 * @typedef {ActionOptions} WaitAction
 * @property {?number} timeout
 */

/**
 * A list of rules whose results are gathered into one object.
 * @typedef {Array.<Rule>} Collection
 */

/**
 * A collection wrapped in an outer array; its rules are parsed once per matched node.
 * @typedef {Array.<Array.<Rule>>} Grid
 */

/**
 * @typedef {object} TransformOptions
 * @property {string} type
 */
81
/**
 * type=date
 * @typedef {TransformOptions} DateTransform
 * @property {?string} locale
 * @property {string} from - date format used for parsing
 * @property {string} to - desired date format
 */

/**
 * type=replace
 * @typedef {TransformOptions} ReplaceTransform
 * @property {?string} locale
 * @property {Array.<string>} re - arguments for RegExp
 * @property {string} to - replacement string
 */
97
/**
 * Kinds of rules the parser distinguishes; Parser#_getRuleType maps a rule to
 * one of these values and Parser#_parseScope dispatches on it.
 * Frozen so the shared lookup table cannot be modified at runtime.
 * @enum {string}
 */
const RULE_TYPE = Object.freeze({
    SIMPLE: 'simple',
    COLLECTION: 'collection',
    GRID: 'grid',
    ACTIONS_RESULT: 'actionsResult',
    GET: 'get',
    VALUE: 'value',
    INJECTION: 'injection',
});
107
/**
 * Parser operating modes. In Parser#parse, `multiple` mode navigates to the
 * given url before parsing, while `single` mode tears the environment down
 * after a successful parse.
 * Frozen so the shared lookup table cannot be modified at runtime.
 * @enum {string}
 */
const PARSING_MODE = Object.freeze({
    SINGLE: 'single',
    MULTIPLE: 'multiple',
});
112
/**
 * Drives an environment through a tree of parsing rules and collects the
 * results. DOM and JS scopes are kept on two stacks which are pushed while a
 * rule is processed and restored afterwards, including when parsing fails.
 */
class Parser {
    /**
     * @param {object} options
     * @param {AbstractEnvironment} options.environment
     * @param {?Paginator} options.paginator
     * @param {?boolean} options.clearDom
     * @param {?string} options.mode one of PARSING_MODE values, `single` by default
     * @throws {Error} when no environment is given
     */
    constructor(options = {}) {
        if (!options.environment) {
            throw new Error('\'environment\' should be specified');
        }

        this._env = options.environment;
        this._paginator = options.paginator;
        this.clearDom = options.clearDom || false;
        this.mode = options.mode || PARSING_MODE.SINGLE;
        this._domScope = new Scope();
        this._jsScope = new Scope();

        /**
         * @type {?Rule}
         * @private
         */
        this._rules = null;

        /**
         * @type {Array}
         * @private
         */
        this._preActions = null;

        this._storage = new Storage({
            'environment:options': this._env.getOptions()
        });

        this._actions = new Actions({
            environment: this._env,
            parser: this,
            storage: this._storage
        });

        if (this._paginator) {
            this._paginator
                .setEnvironment(this._env)
                .setActions(this._actions);
        }

        this._transforms = new Transforms({storage: this._storage});
    }

    /**
     * Prepare the environment, run pre-actions, parse the root rule and,
     * when a paginator is set, keep parsing following pages.
     *
     * @param {object} options
     * @param {Rule} options.rules
     * @param {String} options.url visited only in `multiple` mode
     * @param {Array.<ActionOptions>} options.actions
     * @param {Array.<TransformOptions>} options.transform
     * @returns {Promise} parsing results
     */
    async parse(options = {}) {
        debug('.parse() has called');
        this._rules = options.rules || {};
        this._preActions = options.actions || null;

        let results;

        try {
            await this._env.prepare();
            if (this.mode === PARSING_MODE.MULTIPLE && options.url) {
                await this._env.goto(options.url);
            }

            if (this._paginator) {
                this._paginator.reset();
            }

            if (this._preActions) {
                await this._actions.performActions(this._preActions);
            }

            results = await this._parseRootRule();

            if (options.transform) {
                results = this._transforms.produce(options.transform, results);
            }

            if (this._paginator) {
                results = await this._paginate(results);
            }
            if (this.mode === PARSING_MODE.SINGLE) {
                await this.finish();
            }
        } catch (e) {
            // A failing teardown must not hide the error which aborted parsing.
            try {
                await this.finish();
            } catch (finishError) {
                debug('Finishing after a failed parse failed as well %o', finishError.stack || finishError);
            }
            throw e;
        }

        return results;
    }

    /**
     * Tear the environment down. When teardown fails, an 'error' snapshot is
     * attempted, teardown is retried and the first teardown error is rethrown.
     *
     * @returns {Promise}
     */
    async finish() {
        try {
            await this._env.tearDown();
        } catch (e) {
            try {
                await this._env.snapshot('error');
                await this._env.tearDown();
            } catch (snapshotError) {
                await this._env.tearDown();
            }
            throw e;
        }
    }

    /**
     * Pass the given milestones one after another, then tear the environment
     * down. On failure an 'error' snapshot and a teardown are attempted and
     * the error which stopped the walk is rethrown.
     *
     * @param {object} stages
     * @param {Array.<object>} stages.milestones
     * @param {?Array.<object>} stages.edgeCases
     * @returns {Promise}
     */
    async moveYourFeet(stages) {
        debug('Hit the road!');
        const {milestones, edgeCases} = stages;

        try {
            await this._env.prepare();
            for (const milestone of milestones) {
                await this.passMilestone(milestone, edgeCases);
            }

            await this._env.tearDown();
        } catch (e) {
            // Cleanup is best effort: its failures are logged so that the
            // caller always receives the original error.
            try {
                await this._env.snapshot('error');
            } catch (snapshotError) {
                debug('Error snapshot failed %o', snapshotError.stack || snapshotError);
            }
            try {
                await this._env.tearDown();
            } catch (tearDownError) {
                debug('Teardown failed %o', tearDownError.stack || tearDownError);
            }
            throw e;
        }
    }

    /**
     * Check the milestone condition and, when it holds, process the
     * milestone rules. A failed condition or a thrown error is handed to
     * {@link Parser#catchFailedMilestone}.
     *
     * @param {object} milestone
     * @param {?Array.<object>} edgeCases
     * @param {number} [attemptNumber] zero based attempt counter
     * @returns {Promise}
     */
    async passMilestone(milestone, edgeCases, attemptNumber) {
        attemptNumber = attemptNumber || 0;
        debug('Passing %o milestone, attempt #%s', milestone, attemptNumber);

        try {
            const result = await this._actions.performActions(milestone.condition);
            if (!result) {
                debug('Milestone condition failed');
                return this.catchFailedMilestone(milestone, edgeCases, attemptNumber, 'Milestone condition failed');
            }

            debug('Milestone condition passed, passing milestone');
            return this.processRule(milestone.rules, 0);
        } catch (e) {
            debug('Caught milestone error %o', e.stack || e);
            return this.catchFailedMilestone(milestone, edgeCases, attemptNumber, e);
        }
    }

    /**
     * Try to recover from a failed milestone by handling edge cases. When an
     * edge case was handled the milestone is tried again, otherwise edge
     * cases are tried again with an increased attempt number.
     *
     * @param {object} milestone
     * @param {?Array.<object>} edgeCases
     * @param {number} attemptNumber
     * @param {Error|string} originalError
     * @returns {Promise}
     * @throws {Error} when attemptNumber is greater than MAX_MILESTONE_ATTEMPTS
     */
    async catchFailedMilestone(milestone, edgeCases, attemptNumber, originalError) {
        debug('Catching failing milestone');
        if (attemptNumber > MAX_MILESTONE_ATTEMPTS) {
            throw new Error(`Milestone failed more than ${MAX_MILESTONE_ATTEMPTS} times, original error: ${originalError.stack || originalError}`);
        }

        const edgeCasesHandled = await this.handleEdgeCases(edgeCases);
        if (!edgeCasesHandled) {
            debug('Catching edge cases failed');
            return this.catchFailedMilestone(milestone, edgeCases, attemptNumber + 1, originalError);
        }

        debug('Edge case handled, another try to pass milestone');
        return this.passMilestone(milestone, edgeCases, attemptNumber + 1);
    }

    /**
     * Try edge cases in order until one of them reports a truthy result.
     *
     * @param {?Array.<object>} edgeCases a missing list is treated as empty
     * @returns {Promise} first truthy result, otherwise a falsy value
     */
    async handleEdgeCases(edgeCases) {
        let handled = false;
        for (const edgeCase of edgeCases || []) {
            handled = await this.handleEdgeCase(edgeCase);
            if (handled) {
                break;
            }
        }

        return handled;
    }

    /**
     * Check the edge case condition and process its rules when it holds.
     * Errors are logged and reported as `false`.
     *
     * @param {object} edgeCase
     * @returns {Promise} result of the edge case rules or `false`
     */
    async handleEdgeCase(edgeCase) {
        debug('Handling edge case %o', edgeCase);

        try {
            const result = await this._actions.performActions(edgeCase.condition);
            if (!result) {
                debug('Edge case condition failed');
                return false;
            }

            debug('Edge case condition is true, trying to handle the case');
            return await this.processRule(edgeCase.rules, 0);
        } catch (e) {
            debug('Caught edge case error %o', e.stack || e);
            return false;
        }
    }

    /**
     * @see {@link Actions#addAction}
     */
    addAction(type, action) {
        return this._actions.addAction(type, action);
    }

    /**
     * @see {@link Transforms#addTransform}
     */
    addTransform(type, transform) {
        return this._transforms.addTransform(type, transform);
    }

    /**
     * @param {number} [offset]
     * @returns {Promise}
     * @private
     */
    async _parseRootRule(offset) {
        offset = offset || 0;
        debug('Parsing root rule with offset: %s', offset);
        return this.processRule(this._rules, offset);
    }

    /**
     * Process one rule: push its scopes, perform its actions, parse it,
     * perform its post actions and restore the scopes. Scopes are restored
     * even when an action or the parsing throws, so a retry starts from
     * clean scope stacks.
     *
     * @param {Rule} rule
     * @param {number} [offset]
     * @returns {Promise}
     */
    async processRule(rule, offset) {
        debug('Process rule %o', rule);
        let scopePushed = false;
        let jsScopePushed = false;
        if (rule.jsScope) {
            this._jsScope.push(rule.jsScope, rule.jsParentScope);
            jsScopePushed = true;
        }
        if (rule.scope) {
            this._domScope.push(rule.scope, rule.parentScope);
            scopePushed = true;
        }

        try {
            const domSelector = this._domScope.getSelector();
            const actionsResult = await this._actions.performForRule(rule, domSelector);

            // The rule which is actually parsed: either the given one or the
            // one returned by its actions.
            let activeRule = rule;
            let actionsScopePushed = false;
            let actionsJsScopePushed = false;
            let results;

            try {
                if (rule.rulesFromActions) {
                    if (!actionsResult) {
                        throw new Error('Rule node marked with "rulesFromActions" flag should return rules from action. Got nothing.');
                    }
                    debug('Rules extracted from action %o', rule);
                    // use child transform or parent transform or nothing
                    actionsResult.transform = actionsResult.transform || rule.transform || false;
                    if (!('inject' in actionsResult)) {
                        actionsResult.inject = rule.inject;
                    }
                    if ('scope' in actionsResult) {
                        this._domScope.push(actionsResult.scope, actionsResult.parentScope);
                        actionsScopePushed = true;
                    }
                    if ('jsScope' in actionsResult) {
                        this._jsScope.push(actionsResult.jsScope, actionsResult.jsParentScope);
                        actionsJsScopePushed = true;
                    }

                    activeRule = actionsResult;

                    if ('actions' in activeRule) {
                        await this._actions.performForRule(activeRule, domSelector);
                    }
                }

                results = await this._parseScope(activeRule, offset, actionsResult);
            } finally {
                if (actionsScopePushed) {
                    this._domScope.pop();
                }
                if (actionsJsScopePushed) {
                    this._jsScope.pop();
                }
            }

            // Post actions run after a successful parse while the rule's own
            // scopes are still in place.
            await this._actions.performPostActionsForRule(activeRule, domSelector);

            return results;
        } finally {
            if (scopePushed) {
                this._domScope.pop();
            }
            if (jsScopePushed) {
                this._jsScope.pop();
            }
        }
    }

    /**
     * Parse a scope
     * @param {Rule} rule parsing rule
     * @param {number} [offset] offset for GridRule
     * @param {*} [actionsResults]
     * @returns {Promise}
     * @private
     */
    async _parseScope(rule, offset, actionsResults) {
        let results;
        const ruleType = this._getRuleType(rule);
        debug('Parse %s rule', ruleType);
        switch (ruleType) {
            case RULE_TYPE.ACTIONS_RESULT:
                results = actionsResults;
                break;

            case RULE_TYPE.GET:
                results = this._storage.get(rule.get);
                break;

            case RULE_TYPE.VALUE:
                results = rule.value;
                break;

            case RULE_TYPE.GRID:
                results = await this._parseGridRule(rule, offset);
                break;

            case RULE_TYPE.COLLECTION:
                results = await this._parseCollectionRule(rule);
                break;

            case RULE_TYPE.SIMPLE:
                results = await this._parseSimpleRule(rule);
                break;

            case RULE_TYPE.INJECTION:
                results = await this._injectBrowserRule(rule, offset, actionsResults);
                break;
        }

        // Unwraps [[...]] produced by transforming a simple rule of type "array".
        const extract = value => {
            if (
                ruleType === RULE_TYPE.SIMPLE &&
                rule.type === 'array' &&
                Array.isArray(value) &&
                value.length === 1 &&
                Array.isArray(value[0])
            ) {
                debug('Extracted %o', value[0]);
                return value[0];
            }

            return value;
        };

        // Collapses arrays for non-array rules and wraps single values for array rules.
        const format = value => {
            if ([RULE_TYPE.SIMPLE, RULE_TYPE.GET, RULE_TYPE.VALUE, RULE_TYPE.COLLECTION].includes(ruleType)) {
                if (Array.isArray(value) && rule.type !== 'array') {
                    return value.length === 1 ? value[0] : value.join(rule.separator || ' ');
                }
                if (!Array.isArray(value) && rule.type === 'array') {
                    return [value];
                }
            }

            return value;
        };

        // Applies the rule's set / add / unset storage instructions.
        const updateResultsInStore = value => {
            if (rule.set) {
                this._storage.set(rule.set, value);
            }
            if (rule.add) {
                const current = this._storage.get(rule.add) || [];
                current.push(value);
                this._storage.set(rule.add, current);
            }
            if (rule.unset) {
                this._storage.unset(rule.unset);
            }
            return value;
        };

        if (rule.transform) {
            results = format(this.transform(results, rule.transform));
            results = extract(results);
        } else {
            results = format(results);
        }

        return updateResultsInStore(results);
    }

    /**
     * Perform transformation on results. Array results are transformed item
     * by item and string items are trimmed first.
     * @param results
     * @param transform
     * @returns {*}
     */
    transform(results, transform) {
        if (!Array.isArray(results)) {
            return this._transforms.produce(transform, results);
        }

        return results.map(result => {
            const value = typeof result === 'string' ? result.trim() : result;
            return this._transforms.produce(transform, value);
        });
    }

    /**
     * Get rule type. Checks are ordered: inject, useActionsResult, get,
     * value, collection (grid when its first item is an array), simple.
     * @param {Object} rule
     * @returns {string}
     */
    _getRuleType(rule) {
        if (rule.inject) {
            return RULE_TYPE.INJECTION;
        }

        if (rule.useActionsResult) {
            return RULE_TYPE.ACTIONS_RESULT;
        }

        if (rule.get) {
            return RULE_TYPE.GET;
        }

        if (typeof rule.value !== 'undefined') {
            return RULE_TYPE.VALUE;
        }

        if (Array.isArray(rule.collection)) {
            return Array.isArray(rule.collection[0]) ? RULE_TYPE.GRID : RULE_TYPE.COLLECTION;
        }

        return RULE_TYPE.SIMPLE;
    }

    /**
     * Parse Grid rule
     * @param {Rule} rule
     * @param {number} [offset] index of the first node to parse
     * @returns {Promise} list of parsed rows, empty when there is nothing at or after the offset
     * @private
     */
    async _parseGridRule(rule, offset) {
        debug('._parseGridRule() has called');
        offset = offset || 0;
        const maxItems = rule.maxItems || null;
        const collection = rule.collection[0];
        const totalNodes = await this._env.evaluateJs(
            this._domScope.getSelector(),
            this._jsScope.getSelector(),
            /* istanbul ignore next */ function(domSelector, jsSelector) {
                var domResult = domSelector && Sizzle(domSelector).length;
                if (domSelector) {
                    return domResult;
                }
                var jsObject = jsSelector && eval(jsSelector);
                if (jsObject && Array.isArray(jsObject)) {
                    return jsObject.length;
                }
            });
        // Nothing matched, or every matched node lies before the offset:
        // there is no row to parse, so do not fabricate one.
        if (!totalNodes || offset >= totalNodes) {
            return [];
        }
        const nodesCount = maxItems && totalNodes > maxItems ? maxItems : totalNodes;
        debug('parsing %s nodes', nodesCount);

        // The grid scope is replaced by per-row scopes while rows are parsed.
        const scope = this._domScope.pop();
        const jsScope = this._jsScope.pop();
        let results;
        try {
            results = await this._parseRow({
                collection: collection,
                nodesCount: nodesCount - 1 - offset,
                offset: offset,
                scope: scope,
                jsScope: jsScope,
                results: []
            });
        } finally {
            if (scope) {
                this._domScope.push(scope.scope, scope.parentScope);
            }
            if (jsScope) {
                this._jsScope.push(jsScope.scope, jsScope.parentScope);
            }
        }

        debug('._parseGridRule() results %o', results);
        return results;
    }

    /**
     * Parse rows of Grid rule starting at options.offset. One row is always
     * parsed, then options.nodesCount more rows follow.
     * @param {object} options
     * @param {Collection} options.collection
     * @param {number} options.nodesCount rows left after the first one
     * @param {number} options.offset
     * @param {?object} options.scope popped DOM scope of the grid
     * @param {?object} options.jsScope popped JS scope of the grid
     * @param {Array} options.results accumulator, also the return value
     * @returns {Promise}
     * @private
     */
    async _parseRow(options) {
        const {scope, jsScope} = options;
        let domSelector = null;
        let jsSelector = null;

        for (;;) {
            domSelector = scope ? scope.scope + ':eq(' + options.offset + ')' : null;
            jsSelector = jsScope ? jsScope.scope + '[' + options.offset + ']' : null;
            debug('._parseRow() has called for %s | %s', domSelector, jsSelector);
            if (domSelector) {
                this._domScope.push(domSelector, scope.parentScope);
            }
            if (jsSelector) {
                this._jsScope.push(jsSelector, jsScope.parentScope);
            }

            try {
                const row = await this._parseCollectionRule({
                    collection: options.collection
                });
                options.results.push(row);
            } finally {
                if (domSelector) {
                    this._domScope.pop();
                }
                if (jsSelector) {
                    this._jsScope.pop();
                }
            }

            options.nodesCount--;
            if (options.nodesCount < 0) {
                break;
            }
            options.offset++;
        }

        const results = options.results;
        if (this.clearDom) {
            // Only the node of the last parsed row is cleared here.
            debug('clear parsed dom for %s', domSelector);
            await this._env.evaluateJs(domSelector, /* istanbul ignore next */ function(domSelector) {
                const parsedElement = Sizzle(domSelector)[0];
                if (!parsedElement) {
                    return;
                }
                const boundingRect = parsedElement.getBoundingClientRect();
                parsedElement.innerHTML = '';
                parsedElement.style.height = boundingRect.height + 'px';
                parsedElement.style.width = boundingRect.width + 'px';
            });
        }
        return results;
    }

    /**
     * Parse a rule inside the browser: inject the browser environment, start
     * parsing there and wait until either results or an error are reported.
     * @param {Rule} rule
     * @param {number} [offset]
     * @param {*} [actionsResults] not used
     * @returns {Promise} results reported by the browser side
     * @private
     */
    async _injectBrowserRule(rule, offset, actionsResults) {
        debug('._injectBrowserRule()');
        let internalGooseResults, internalGooseError;
        await this._env.injectBrowserEnv();
        await this._env.evaluateJs(rule, offset, this._domScope, function(rule, offset, scopes) {
            __gooseParse(rule, offset, scopes);
        });
        await waitForEvaluate(this._env,
            () => {
                return [__gooseResults, __gooseError];
            },
            resultsToCheck => {
                internalGooseResults = resultsToCheck[0];
                internalGooseError = resultsToCheck[1];
                return internalGooseResults || internalGooseError;
            },
            () => false,
            null,
            rule.injectionTimeout
        );

        if (internalGooseError) {
            throw internalGooseError;
        }

        return internalGooseResults;
    }

    /**
     * Parse Collection rule: process child rules one by one and gather their
     * results into an object. A result is stored under `_id` when the child
     * rule's `id` is a boolean or a function, otherwise under the rule name;
     * results of `virtual` rules are not stored.
     * @param {Rule} rule
     * @returns {Promise}
     * @private
     */
    async _parseCollectionRule(rule) {
        debug('._parseCollectionRule() has called for rule %o', rule);

        const results = {};
        for (const childRule of rule.collection) {
            let result = await this.processRule(childRule);
            let name;
            switch (typeof childRule.id) {
                case 'boolean':
                    name = '_id';
                    break;
                case 'function':
                    name = '_id';
                    result = childRule.id.call(this, childRule, result);
                    break;
                default:
                    name = childRule.name;
            }
            if (!childRule.virtual) {
                results[name] = result;
            }
        }
        debug('._parseCollectionRule() result %o', results);
        return results;
    }

    /**
     * Choose what is read from a matched node: a child node's text, an
     * attribute, a property or, by default, the node's text.
     * @param {Rule} rule
     * @returns {{type: string, value: string|number}}
     * @private
     */
    _getSimpleRuleFilter(rule) {
        if (typeof rule.child !== 'undefined') {
            return {type: 'child', value: rule.child};
        }
        if (rule.attr) {
            return {type: 'attr', value: rule.attr};
        }
        if (rule.prop) {
            return {type: 'prop', value: rule.prop};
        }

        return {type: 'text'};
    }

    /**
     * Parse simple rule
     * @param {Rule} rule
     * @returns {Promise} list of values, one per matched node
     * @throws {Error} when the environment returns nothing for the selector
     * @private
     */
    async _parseSimpleRule(rule) {
        const selector = this._domScope.getSelector();
        const jsSelector = rule.jsScope ? this._jsScope.getSelector() : '';
        const filter = this._getSimpleRuleFilter(rule);
        debug('._parseSimpleRule() has called for selector %s with filter %o', selector, filter);
        const results = await this._env.evaluateJs(selector, jsSelector, filter, /* istanbul ignore next */ function(selector, jsSelector, filter) {
            if (jsSelector) {
                const value = eval(jsSelector);
                return Array.isArray(value) ? value : [value];
            }

            const nodes = Sizzle(selector);
            return nodes.map(function(node) {
                switch (filter.type) {
                    case 'child':
                        const childNode = node.childNodes[filter.value];
                        return childNode ? childNode.textContent : '';
                    case 'attr':
                        if (typeof filter.value === 'object' && Array.isArray(filter.value.or)) {
                            const res = filter.value.or.map(function(value) {
                                return node.getAttribute(value);
                            }).filter(Boolean);
                            return res.pop();
                        }
                        return node.getAttribute(filter.value);
                    case 'prop':
                        return node[filter.value];
                    default:
                        return node.textContent;
                }
            });
        });
        if (!results) {
            throw new Error('Error during querying selector: ' + (selector || jsSelector));
        }
        debug('._parseSimpleRule() result %o', results);
        return results;
    }

    /**
     * Keep paginating and parsing the root rule until the paginator is done
     * or the results limit is exceeded.
     * @param results
     * @returns {Promise.<*>}
     * @private
     */
    async _paginate(results) {
        debug('Pagination...');
        const pagination = await this._paginator.paginate();
        if (pagination.done) {
            return results;
        }

        const offset = this._paginator.resetCollectionOffsetOnNewPage() ? 0 : results.length;

        const pageResults = await this._parseRootRule(offset);
        debug('Pagination results %o', pageResults);
        results = results.concat(pageResults);
        // NOTE(review): the limit is getMaxResultsCount() - 1, so one result
        // fewer than the reported maximum is kept - confirm this is intended.
        const maxResults = this._paginator.getMaxResultsCount() - 1;
        if (results.length > maxResults) {
            return results.slice(0, maxResults);
        }
        return this._paginate(results);
    }
}
815
816module.exports = Parser;