1 | const dom = require('./dom');
|
2 |
|
// Selector for list elements (ordered or unordered); used to pick list
// children in findList and, combined with SELECTOR_PART, in findParts.
3 | const SELECTOR_LIST = 'ol, ul';
|
// Selector handed to `find` on a list item in parseList to locate its link:
// an anchor that is a direct child, or an anchor directly inside a <p>.
4 | const SELECTOR_LINK = '> a, p > a';
|
// Selector for the heading elements that findParts treats as part titles;
// isPartNode also inspects this string to classify a node by tag name.
5 | const SELECTOR_PART = 'h2, h3, h4';
|
6 |
|
7 | const utils = require('../utils');
|
8 |
|
9 |
|
/**
 * Locate the list element(s) that belong to a given element.
 *
 * When the element has a direct child carrying the `olist` class, the lists
 * are looked up inside the first such child; otherwise they are looked up
 * directly under the element itself.
 * NOTE(review): `.olist` is presumably a wrapper emitted by the upstream
 * HTML renderer -- confirm against the documents this parser receives.
 *
 * @param {Object} $parent - Selection to search (cheerio/jQuery-style API).
 * @returns {Object} Selection of direct children matching SELECTOR_LIST.
 */
function findList($parent) {
    const $wrapper = $parent.children('.olist');
    const $scope = $wrapper.length > 0 ? $wrapper.first() : $parent;

    return $scope.children(SELECTOR_LIST);
}
|
16 |
|
/**
 * Convert a list selection into a nested array of article descriptors.
 *
 * Every direct `li` child produces `{ title, ref?, articles }`:
 *  - `title` defaults to the trimmed text of the item's direct `<p>` children,
 *    falling back to `dom.textNode` of the raw node; when a link matching
 *    SELECTOR_LINK exists, the first link's text replaces it.
 *  - `ref` is the link's `href` with backslashes turned into forward slashes
 *    and leading slashes removed. It is only set when the link actually has
 *    an `href` attribute.
 *  - `articles` is the recursive result for the nested list found by findList.
 * Items that end up with an empty title are skipped.
 *
 * @param {Object} $ul - Selection wrapping the list to read (cheerio/jQuery-style API).
 * @param {Function} $ - Function wrapping a raw node into a selection.
 * @returns {Array<Object>} Article descriptors in document order.
 */
function parseList($ul, $) {
    const articles = [];

    $ul.children('li').each(function () {
        const $li = $(this);
        const article = {};

        // Text of the entry itself, used when the item carries no link.
        const $p = $li.children('p');
        article.title = ($p.text() || dom.textNode($li.get(0))).trim();

        const $a = $li.find(SELECTOR_LINK);
        if ($a.length > 0) {
            article.title = $a.first().text();

            // An anchor without an href (e.g. a bare named anchor) yields
            // undefined here; calling .replace on it would throw.
            const href = $a.attr('href');
            if (typeof href === 'string') {
                article.ref = href.replace(/\\/g, '/').replace(/^\/+/, '');
            }
        }

        // Untitled items are dropped, so there is no point descending into them.
        if (!article.title) return;

        article.articles = parseList(findList($li), $);
        articles.push(article);
    });

    return articles;
}
|
41 |
|
/**
 * Group the direct children of an element into parts.
 *
 * Only children matching SELECTOR_LIST or SELECTOR_PART are considered, in
 * document order. A heading opens a part; the next list closes it by becoming
 * its `list`. A list with no open heading becomes a part with an empty title,
 * and a heading followed by another heading (or by nothing) is emitted with
 * `list: null`.
 *
 * @param {Object} $parent - Selection whose children are scanned (cheerio/jQuery-style API).
 * @param {Function} $ - Function wrapping a raw node, forwarded to getPartTitle.
 * @returns {Array<{title: string, list: ?Object}>} Parts; `list` is the raw list node or null.
 */
function findParts($parent, $) {
    const $nodes = $parent.children(`${SELECTOR_LIST}, ${SELECTOR_PART}`);
    const collected = [];

    // Part opened by a heading that has not received its list yet.
    let open = null;

    $nodes.each((index, node) => {
        if (!isPartNode(node)) {
            // A list either completes the open part or forms an untitled one.
            const part = open !== null ? open : { title: '', list: null };
            part.list = node;
            collected.push(part);
            open = null;
            return;
        }

        // A new heading flushes any earlier heading that never got a list.
        if (open !== null) {
            collected.push(open);
        }
        open = { title: getPartTitle(node, $), list: null };
    });

    if (open !== null) {
        collected.push(open);
    }

    return collected;
}
|
78 |
|
/**
 * Tell whether a node is one of the heading tags listed in SELECTOR_PART.
 *
 * @param {Object} el - Raw node; only its `name` (tag name) is inspected.
 * @returns {boolean} True when `el.name` equals one of the listed tag names.
 */
function isPartNode(el) {
    // Compare against whole tag names: a substring search on the selector
    // string would also accept fragments such as '', 'h' or ', '.
    return SELECTOR_PART.split(',').some((tag) => tag.trim() === el.name);
}
|
82 |
|
83 |
|
/**
 * Read the title of a part from its heading node.
 *
 * @param {Object} el - Raw heading node.
 * @param {Function} $ - Function wrapping a raw node into a selection exposing `text()`.
 * @returns {string} The node's text with surrounding whitespace removed.
 */
function getPartTitle(el, $) {
    const rawTitle = $(el).text();

    return rawTitle.trim();
}
|
87 |
|
/**
 * Parse the HTML of a summary into a list of parts with their articles.
 *
 * The markup is loaded with `dom.parse`, its root is passed through
 * `dom.cleanup`, the root's children are grouped by findParts, and each
 * part's list is expanded by parseList.
 * NOTE(review): what `dom.root` and `dom.cleanup` select or strip is defined
 * in ./dom and not visible here -- confirm there.
 *
 * @param {string} html - Summary markup.
 * @returns {Array<{title: string, articles: Array<Object>}>} One entry per part, in document order.
 */
function parseSummary(html) {
    const $ = dom.parse(html);
    const $root = dom.cleanup(dom.root($), $);

    return findParts($root, $).map((part) => ({
        title: part.title,
        articles: parseList($(part.list), $)
    }));
}
|
106 |
|
// The module's sole export is the parseSummary function itself.
107 | module.exports = parseSummary;
|