UNPKG

scrape-it/lib/index.js

Version:

4.01 kBJavaScriptView Raw

1"use strict";
2
3var req = require("cheerio-req"),
  typpy = require("typpy"),
  assured = require("assured"),
  scrapeHtml = require("scrape-it-core");
7
/**
 * scrapeIt
 * A scraping module for humans.
 *
 * Requests the page through `req`, runs `scrapeIt.scrapeHTML` on the loaded
 * document and reports the outcome through `cb`: either `cb(err)` when the
 * request or the scraping fails, or `cb(null, result)` on success.
 *
 * @name scrapeIt
 * @function
 * @param {String|Object} url The page url or request options.
 * @param {Object} opts The options passed to `scrapeHTML` method.
 * @param {Function} cb The callback function.
 * @return {Promise} A promise object resolving with:
 *
 *   - `data` (Object): The scraped data.
 *   - `$` (Function): The Cheerio function. This may be handy to do some other manipulation on the DOM, if needed.
 *   - `response` (Object): The response object.
 *   - `body` (String): The raw body as a string.
 *
 */
function scrapeIt(url, opts, cb) {
    // `assured` wraps the callback; the wrapper's `_` property is what this
    // function returns (the promise described above).
    cb = assured(cb);

    req(url, function (err, $, res, body) {
        if (err) {
            return cb(err);
        }

        // Only the scraping step is guarded: it is the only call here whose
        // failure should be reported through `cb`.
        var scrapedData;
        try {
            scrapedData = scrapeIt.scrapeHTML($, opts);
        } catch (scrapeErr) {
            return cb(scrapeErr);
        }

        // Deliberately outside the `try`: if the callback itself throws, the
        // exception must not be caught here, otherwise `cb` would be invoked
        // a second time with the callback's own error.
        cb(null, {
            data: scrapedData,
            $: $,
            response: res,
            body: body
        });
    });

    return cb._;
}
45
46/**
* scrapeIt.scrapeHTML
* Scrapes the data in the provided element.
*
* For the format of the selector, please refer to the [Selectors section of the Cheerio library](https://github.com/cheeriojs/cheerio#-selector-context-root-)
*
* @name scrapeIt.scrapeHTML
* @function
* @param {Cheerio} $ The input element.
* @param {Object} opts An object containing the scraping information.
*
*   If you want to scrape a list, you have to use the `listItem` selector:
*
*    - `listItem` (String): The list item selector.
*    - `data` (Object): The fields to include in the list objects:
*       - `<fieldName>` (Object|String): The selector or an object containing:
*          - `selector` (String): The selector.
*          - `convert` (Function): An optional function to change the value.
*          - `how` (Function|String): A function or function name to access the
*            value.
*          - `attr` (String): If provided, the value will be taken based on
*            the attribute name.
*          - `trim` (Boolean): If `false`, the value will *not* be trimmed
*            (default: `true`).
*          - `closest` (String): If provided, returns the first ancestor of
*            the given element.
*          - `eq` (Number): If provided, it will select the *nth* element.
*          - `texteq` (Number): If provided, it will select the *nth* direct text child.
*            Deep text child selection is not possible yet.
*            Overwrites the `how` key.
*          - `listItem` (Object): An object, keeping the recursive schema of
*            the `listItem` object. This can be used to create nested lists.
*
*   **Example**:
*   ```js
*   {
*      articles: {
*          listItem: ".article"
*        , data: {
*              createdAt: {
*                  selector: ".date"
*                , convert: x => new Date(x)
*              }
*            , title: "a.article-title"
*            , tags: {
*                  listItem: ".tags > span"
*              }
*            , content: {
*                  selector: ".article-content"
*                , how: "html"
*              }
*            , traverseOtherNode: {
*                  selector: ".upperNode"
*                , closest: "div"
*                , convert: x => x.length
*              }
*          }
*      }
*   }
*   ```
*
*   If you want to collect specific data from the page, just use the same
*   schema used for the `data` field.
*
*   **Example**:
*   ```js
*   {
*        title: ".header h1"
*      , desc: ".header h2"
*      , avatar: {
*            selector: ".header img"
*          , attr: "src"
*        }
*   }
*   ```
*
* @returns {Object} The scraped data.
*/
// Expose the core scraper on the public function. The dependency is bound as
// `scrapeHtml` in the `require` block at the top of this file; referencing the
// undeclared name `scrapeHTML` here would throw a ReferenceError at load time.
scrapeIt.scrapeHTML = scrapeHtml;

module.exports = scrapeIt;
\No newline at end of file

1	`"use strict";`
2
3	`var req = require("cheerio-req"),`
4	`typpy = require("typpy"),`
5	`assured = require("assured"),`
6	`scrapeHtml = require("scrape-it-core");`
7
8	`/**`
9	`* scrapeIt`
10	`* A scraping module for humans.`
11	`*`
12	`* @name scrapeIt`
13	`* @function`
14	`* @param {String\|Object} url The page url or request options.`
15	* @param {Object} opts The options passed to `scrapeHTML` method.
16	`* @param {Function} cb The callback function.`
17	`* @return {Promise} A promise object resolving with:`
18	`*`
19	* - `data` (Object): The scraped data.
20	* - `$` (Function): The Cheerio function. This may be handy to do some other manipulation on the DOM, if needed.
21	* - `response` (Object): The response object.
22	* - `body` (String): The raw body as a string.
23	`*`
24	`*/`
25	`function scrapeIt(url, opts, cb) {`
26	`cb = assured(cb);`
27	`req(url, function (err, $, res, body) {`
28	`if (err) {`
29	`return cb(err);`
30	`}`
31	`try {`
32	`var scrapedData = scrapeIt.scrapeHTML($, opts);`
33	`cb(null, {`
34	`data: scrapedData,`
35	`$: $,`
36	`response: res,`
37	`body: body`
38	`});`
39	`} catch (err) {`
40	`cb(err);`
41	`}`
42	`});`
43	`return cb._;`
44	`}`
45
46	`/**`
47	`* scrapeIt.scrapeHTML`
48	`* Scrapes the data in the provided element.`
49	`*`
50	`* For the format of the selector, please refer to the [Selectors section of the Cheerio library](https://github.com/cheeriojs/cheerio#-selector-context-root-)`
51	`*`
52	`* @name scrapeIt.scrapeHTML`
53	`* @function`
54	`* @param {Cheerio} $ The input element.`
55	`* @param {Object} opts An object containing the scraping information.`
56	`*`
57	* If you want to scrape a list, you have to use the `listItem` selector:
58	`*`
59	* - `listItem` (String): The list item selector.
60	* - `data` (Object): The fields to include in the list objects:
61	* - `<fieldName>` (Object\|String): The selector or an object containing:
62	* - `selector` (String): The selector.
63	* - `convert` (Function): An optional function to change the value.
64	* - `how` (Function\|String): A function or function name to access the
65	`* value.`
66	* - `attr` (String): If provided, the value will be taken based on
67	`* the attribute name.`
68	* - `trim` (Boolean): If `false`, the value will not be trimmed
69	* (default: `true`).
70	* - `closest` (String): If provided, returns the first ancestor of
71	`* the given element.`
72	* - `eq` (Number): If provided, it will select the nth element.
73	* - `texteq` (Number): If provided, it will select the nth direct text child.
74	`* Deep text child selection is not possible yet.`
75	* Overwrites the `how` key.
76	* - `listItem` (Object): An object, keeping the recursive schema of
77	* the `listItem` object. This can be used to create nested lists.
78	`*`
79	`* Example:`
80	* ```js
81	`* {`
82	`* articles: {`
83	`* listItem: ".article"`
84	`* , data: {`
85	`* createdAt: {`
86	`* selector: ".date"`
87	`* , convert: x => new Date(x)`
88	`* }`
89	`* , title: "a.article-title"`
90	`* , tags: {`
91	`* listItem: ".tags > span"`
92	`* }`
93	`* , content: {`
94	`* selector: ".article-content"`
95	`* , how: "html"`
96	`* }`
97	`* , traverseOtherNode: {`
98	`* selector: ".upperNode"`
99	`* , closest: "div"`
100	`* , convert: x => x.length`
101	`* }`
102	`* }`
103	`* }`
104	`* }`
105	* ```
106	`*`
107	`* If you want to collect specific data from the page, just use the same`
108	* schema used for the `data` field.
109	`*`
110	`* Example:`
111	* ```js
112	`* {`
113	`* title: ".header h1"`
114	`* , desc: ".header h2"`
115	`* , avatar: {`
116	`* selector: ".header img"`
117	`* , attr: "src"`
118	`* }`
119	`* }`
120	* ```
121	`*`
122	`* @returns {Object} The scraped data.`
123	`*/`
124	`scrapeIt.scrapeHTML = scrapeHTML;`
125
126	`module.exports = scrapeIt;`
\	No newline at end of file