1 | ;
|
2 |
|
3 | var req = require("cheerio-req"),
|
4 | typpy = require("typpy"),
|
5 | assured = require("assured"),
|
6 | scrapeHtml = require("scrape-it-core");
|
7 |
|
/**
 * scrapeIt
 * A scraping module for humans.
 *
 * Requests the page through `cheerio-req`, runs `scrapeIt.scrapeHTML` on the
 * loaded document and reports the outcome through the (wrapped) callback.
 *
 * @name scrapeIt
 * @function
 * @param {String|Object} url The page url or request options.
 * @param {Object} opts The options passed to `scrapeHTML` method.
 * @param {Function} cb The callback function. It is called with `(err)` when
 * the request or the scraping step fails, and with `(null, result)` otherwise.
 * @return {Promise} A promise object (the `_` member of the `assured`
 * wrapper around `cb`) resolving with:
 *
 *  - `data` (Object): The scraped data.
 *  - `$` (Function): The Cheerio function. This may be handy to do some other manipulation on the DOM, if needed.
 *  - `response` (Object): The response object.
 *  - `body` (String): The raw body as a string.
 *
 */
function scrapeIt(url, opts, cb) {
    cb = assured(cb);
    req(url, function (err, $, res, body) {
        if (err) {
            return cb(err);
        }

        // Only the scraping step is guarded: a failure here (for instance a
        // throwing `convert` function) is reported through the callback.
        var scrapedData;
        try {
            scrapedData = scrapeIt.scrapeHTML($, opts);
        } catch (scrapeErr) {
            return cb(scrapeErr);
        }

        // Deliberately outside the `try`: if the callback itself throws, the
        // exception must not be caught and passed back into the same
        // callback, which would invoke it a second time.
        cb(null, {
            data: scrapedData,
            $: $,
            response: res,
            body: body
        });
    });
    return cb._;
}
|
45 |
|
46 | /**
|
47 | * scrapeIt.scrapeHTML
|
48 | * Scrapes the data in the provided element.
|
49 | *
|
50 | * For the format of the selector, please refer to the [Selectors section of the Cheerio library](https://github.com/cheeriojs/cheerio#-selector-context-root-)
|
51 | *
|
52 | * @name scrapeIt.scrapeHTML
|
53 | * @function
|
54 | * @param {Cheerio} $ The input element.
|
55 | * @param {Object} opts An object containing the scraping information.
|
56 | *
|
57 | * If you want to scrape a list, you have to use the `listItem` selector:
|
58 | *
|
59 | * - `listItem` (String): The list item selector.
|
60 | * - `data` (Object): The fields to include in the list objects:
|
61 | * - `<fieldName>` (Object|String): The selector or an object containing:
|
62 | * - `selector` (String): The selector.
|
63 | * - `convert` (Function): An optional function to change the value.
|
64 | * - `how` (Function|String): A function or function name to access the
|
65 | * value.
|
66 | * - `attr` (String): If provided, the value will be taken based on
|
67 | * the attribute name.
|
68 | * - `trim` (Boolean): If `false`, the value will *not* be trimmed
|
69 | * (default: `true`).
|
70 | * - `closest` (String): If provided, returns the first ancestor of
|
71 | * the given element.
|
72 | * - `eq` (Number): If provided, it will select the *nth* element.
|
73 | * - `texteq` (Number): If provided, it will select the *nth* direct text child.
|
74 | * Deep text child selection is not possible yet.
|
75 | * Overwrites the `how` key.
|
76 | * - `listItem` (Object): An object, keeping the recursive schema of
|
77 | * the `listItem` object. This can be used to create nested lists.
|
78 | *
|
79 | * **Example**:
|
80 | * ```js
|
81 | * {
|
82 | * articles: {
|
83 | * listItem: ".article"
|
84 | * , data: {
|
85 | * createdAt: {
|
86 | * selector: ".date"
|
87 | * , convert: x => new Date(x)
|
88 | * }
|
89 | * , title: "a.article-title"
|
90 | * , tags: {
|
91 | * listItem: ".tags > span"
|
92 | * }
|
93 | * , content: {
|
94 | * selector: ".article-content"
|
95 | * , how: "html"
|
96 | * }
|
97 | * , traverseOtherNode: {
|
98 | * selector: ".upperNode"
|
99 | * , closest: "div"
|
100 | * , convert: x => x.length
|
101 | * }
|
102 | * }
|
103 | * }
|
104 | * }
|
105 | * ```
|
106 | *
|
107 | * If you want to collect specific data from the page, just use the same
|
108 | * schema used for the `data` field.
|
109 | *
|
110 | * **Example**:
|
111 | * ```js
|
112 | * {
|
113 | * title: ".header h1"
|
114 | * , desc: ".header h2"
|
115 | * , avatar: {
|
116 | * selector: ".header img"
|
117 | * , attr: "src"
|
118 | * }
|
119 | * }
|
120 | * ```
|
121 | *
|
122 | * @returns {Object} The scraped data.
|
123 | */
|
scrapeIt.scrapeHTML = scrapeHtml; // `scrape-it-core` is bound to `scrapeHtml` by the require block at the top of the file

// The module's export is the `scrapeIt` function itself; `scrapeHTML` is
// reachable as a property on it.
module.exports = scrapeIt;
\ | No newline at end of file |