1 | const cheerio = require('cheerio');
|
2 | const _ = require('underscore');
|
3 |
|
/**
 * Load an HTML string with cheerio and hand back its top-level element.
 *
 * The document is loaded with `decodeEntities` disabled.
 *
 * @param {string} html - HTML markup handed to `cheerio.load`.
 * @returns {*} The first match of the selector `html, body` as a cheerio
 *   selection, or the loaded cheerio instance itself when that selector
 *   matches nothing.
 */
function parse(html) {
  const loadOptions = { decodeEntities: false };
  const $ = cheerio.load(html, loadOptions);
  const $top = $('html, body').first();

  if ($top.length > 0) {
    return $top;
  }
  return $;
}
|
11 |
|
12 |
|
13 |
|
14 |
|
15 |
|
/**
 * Pick the top-level element of a loaded cheerio document.
 *
 * @param {Function} $ - Cheerio instance; it is called with a selector and
 *   must also expose `root()`.
 * @returns {*} The first match of `html, body, > div`, or the result of
 *   `$.root()` when that selector matches nothing.
 */
function root($) {
  const $top = $('html, body, > div').first();

  if ($top.length > 0) {
    return $top;
  }
  // Nothing matched: fall back to the document root.
  return $.root();
}
|
20 |
|
21 |
|
22 |
|
23 |
|
24 |
|
/**
 * Concatenate the text held directly by a node's immediate children.
 *
 * Only children whose `type` is exactly `'text'` contribute; text nested
 * inside child elements is not visited.
 *
 * NOTE(review): despite the `$el` name, `children` is read as a property,
 * so this appears to expect a raw DOM node (e.g. `$el[0]`) rather than a
 * cheerio selection - confirm against callers.
 *
 * @param {{children: (Array|undefined)}} $el - Node whose `children` are scanned.
 * @returns {string} The `data` of each direct text child, joined in order;
 *   `''` when there are no children.
 */
function textNode($el) {
  // A missing `children` yields '' (as the previous `_.reduce` did).
  const children = $el.children || [];

  return Array.prototype.reduce.call(
    children,
    (text, node) => (node.type === 'text' ? text + node.data : text),
    ''
  );
}
|
31 |
|
32 |
|
33 |
|
34 |
|
35 |
|
36 |
|
/**
 * Unwrap every `div` found under `$el`, replacing each with its inner HTML.
 *
 * Each matched div is first cleaned recursively, then swapped for whatever
 * `html()` returns for it.
 *
 * @param {*} $el - Selection exposing `find()`; it is modified in place.
 * @param {Function} $ - Function used to wrap each matched element.
 * @returns {*} The same `$el` that was passed in.
 */
function cleanup($el, $) {
  const $divs = $el.find('div');

  $divs.each((index, element) => {
    const $current = $(element);

    // Clean the div's own contents before it is replaced.
    cleanup($current, $);

    const innerHtml = $current.html();
    $current.replaceWith(innerHtml);
  });

  return $el;
}
|
47 |
|
48 | module.exports = {
|
49 | parse: parse,
|
50 | textNode: textNode,
|
51 | root: root,
|
52 | cleanup: cleanup
|
53 | };
|