UNPKG

4.01 kBJavaScriptView Raw
1"use strict";
2
3var req = require("cheerio-req"),
4 typpy = require("typpy"),
5 assured = require("assured"),
6 scrapeHtml = require("scrape-it-core");
7
/**
 * scrapeIt
 * A scraping module for humans.
 *
 * Requests the page through `cheerio-req`, runs `scrapeIt.scrapeHTML` on the
 * loaded document and reports the outcome through the `assured`-wrapped
 * callback.
 *
 * The success callback is invoked *outside* the `try` block on purpose: an
 * exception thrown by the caller's own callback must not be mistaken for a
 * scraping failure and fed back into the callback a second time.
 *
 * @name scrapeIt
 * @function
 * @param {String|Object} url The page url or request options.
 * @param {Object} opts The options passed to `scrapeHTML` method.
 * @param {Function} cb The callback function (`err, result`). It is called
 * exactly once: with the request error, with the error thrown while
 * scraping, or with `null` and the result object.
 * @return {Promise} A promise object resolving with:
 *
 *  - `data` (Object): The scraped data.
 *  - `$` (Function): The Cheerio function. This may be handy to do some other manipulation on the DOM, if needed.
 *  - `response` (Object): The response object.
 *  - `body` (String): The raw body as a string.
 *
 */
function scrapeIt(url, opts, cb) {
    // `assured` wraps the (optional) callback; the wrapper's `_` property
    // is what gets handed back to the caller.
    cb = assured(cb);
    req(url, function (err, $, res, body) {
        if (err) {
            return cb(err);
        }

        // Guard only the scraping step, so the callback below can never be
        // reached a second time through the `catch` branch.
        var scrapedData;
        try {
            scrapedData = scrapeIt.scrapeHTML($, opts);
        } catch (scrapeErr) {
            return cb(scrapeErr);
        }

        cb(null, {
            data: scrapedData,
            $: $,
            response: res,
            body: body
        });
    });
    return cb._;
}
45
/**
 * scrapeIt.scrapeHTML
 * Scrapes the data in the provided element.
 *
 * For the format of the selector, please refer to the [Selectors section of the Cheerio library](https://github.com/cheeriojs/cheerio#-selector-context-root-)
 *
 * @name scrapeIt.scrapeHTML
 * @function
 * @param {Cheerio} $ The input element.
 * @param {Object} opts An object containing the scraping information.
 *
 *   If you want to scrape a list, you have to use the `listItem` selector:
 *
 *    - `listItem` (String): The list item selector.
 *    - `data` (Object): The fields to include in the list objects:
 *       - `<fieldName>` (Object|String): The selector or an object containing:
 *          - `selector` (String): The selector.
 *          - `convert` (Function): An optional function to change the value.
 *          - `how` (Function|String): A function or function name to access the
 *            value.
 *          - `attr` (String): If provided, the value will be taken based on
 *            the attribute name.
 *          - `trim` (Boolean): If `false`, the value will *not* be trimmed
 *            (default: `true`).
 *          - `closest` (String): If provided, returns the first ancestor of
 *            the given element.
 *          - `eq` (Number): If provided, it will select the *nth* element.
 *          - `texteq` (Number): If provided, it will select the *nth* direct text child.
 *            Deep text child selection is not possible yet.
 *            Overwrites the `how` key.
 *          - `listItem` (Object): An object, keeping the recursive schema of
 *            the `listItem` object. This can be used to create nested lists.
 *
 *   **Example**:
 *   ```js
 *   {
 *      articles: {
 *          listItem: ".article"
 *        , data: {
 *              createdAt: {
 *                  selector: ".date"
 *                , convert: x => new Date(x)
 *              }
 *            , title: "a.article-title"
 *            , tags: {
 *                  listItem: ".tags > span"
 *              }
 *            , content: {
 *                  selector: ".article-content"
 *                , how: "html"
 *              }
 *            , traverseOtherNode: {
 *                  selector: ".upperNode"
 *                , closest: "div"
 *                , convert: x => x.length
 *              }
 *          }
 *      }
 *   }
 *   ```
 *
 *   If you want to collect specific data from the page, just use the same
 *   schema used for the `data` field.
 *
 *   **Example**:
 *   ```js
 *   {
 *        title: ".header h1"
 *      , desc: ".header h2"
 *      , avatar: {
 *            selector: ".header img"
 *          , attr: "src"
 *        }
 *   }
 *   ```
 *
 * @returns {Object} The scraped data.
 */
// The `scrape-it-core` export is bound to `scrapeHtml` in the require block at
// the top of this file; the identifier `scrapeHTML` is not declared anywhere
// and would throw a ReferenceError under "use strict" when the module loads.
scrapeIt.scrapeHTML = scrapeHtml;
125
126module.exports = scrapeIt;
\No newline at end of file