1 |
|
2 | const cheerio = require('cheerio');
|
3 | const http = require('http');
|
4 | const https = require('https');
|
5 | const async = require("async");
|
6 |
|
// Keep-alive agents shared by every crawl request. They are created lazily
// and reused so that idle sockets can actually be recycled; creating one
// agent per request leaves its idle socket unused.
const crawlerAgents = {
  http: null,
  https: null
};

/**
 * Fetch `link` with a GET request and pass the response body to `parseLink`.
 *
 * Must be invoked with `this` bound to an object exposing
 * `logger(message, level)` and, for https links, `serverHttps.key` and
 * `serverHttps.cert`.
 *
 * @param {string} link - URL to request.
 * @param {Object} context - When `context.session` is set, its `name` and
 *   `id` are sent as a `Cookie` header.
 * @param {Function} callback - `callback(error, pageObject)`. On success it
 *   is called by `parseLink`; on a request/response error it is called once
 *   with the error so the caller is never left waiting.
 */
const makeRequestHttp = function (link, context, callback) {
  this.logger("REQUEST : " + link, "DEBUG");
  const myurl = url.parse(link);

  const headers = {};
  if (context.session) {
    headers.Cookie = context.session.name + "=" + context.session.id;
  }
  const options = {
    hostname: myurl.hostname,
    port: myurl.port,
    path: myurl.path,
    method: 'GET',
    headers: headers
  };

  let wrapper = http.request;
  if (myurl.protocol === "https:") {
    if (!crawlerAgents.https) {
      crawlerAgents.https = new https.Agent({
        keepAlive: true
      });
    }
    // NOTE(review): rejectUnauthorized is false, so the server certificate is
    // not verified. Kept as in the original; confirm this is only used
    // against trusted hosts.
    nodefony.extend(options, {
      key: this.serverHttps.key,
      cert: this.serverHttps.cert,
      rejectUnauthorized: false,
      requestCert: true,
      agent: crawlerAgents.https
    });
    wrapper = https.request;
  } else {
    if (!crawlerAgents.http) {
      crawlerAgents.http = new http.Agent({
        keepAlive: true
      });
    }
    options.agent = crawlerAgents.http;
  }

  // Guarantees that `callback` is reached at most once per request, whichever
  // of the end/error events arrives first.
  let settled = false;
  const fail = (e) => {
    this.logger('Problem with request: ' + e.message, "ERROR");
    if (!settled) {
      settled = true;
      callback(e);
    }
  };

  const req = wrapper(options, (res) => {
    let bodyRaw = "";
    res.setEncoding('utf8');
    res.on('data', (chunk) => {
      bodyRaw += chunk;
    });
    res.on('error', fail);
    res.on('end', () => {
      if (settled) {
        return;
      }
      settled = true;
      parseLink.call(this, link, bodyRaw, callback);
    });
  });
  req.on('error', fail);
  req.end();
};
|
63 |
|
/**
 * Parse a fetched HTML document into a page descriptor and pass it to
 * `callback`.
 *
 * Must be invoked with `this` exposing `protocol` (e.g. "http://"), `base`
 * (the host) and `logger(message, level)`.
 *
 * The descriptor has the shape
 * `{ url, title, selector, links: [{ linkText, linkUrl }] }` where `selector`
 * is the loaded cheerio instance and every `linkUrl` is a `url.parse` result.
 *
 * @param {string} crawlUrl - URL the body was fetched from; a value starting
 *   with "/" is prefixed with `this.protocol + this.base`.
 * @param {string} body - Raw HTML of the page.
 * @param {Function} callback - Called as `callback(null, pageObject)`.
 */
let parseLink = function (crawlUrl, body, callback) {
  const pageObject = {
    links: []
  };
  pageObject.url = /^\//.test(crawlUrl) ? this.protocol + this.base + crawlUrl : crawlUrl;

  const $ = cheerio.load(body, {
    ignoreWhitespace: true
  });
  pageObject.title = $('title').text();
  pageObject.selector = $;

  $('a').each((i, elem) => {
    const rawHref = elem.attribs.href;
    // Anchors without a target, and the "#" / "/" shortcuts, are not crawled.
    if (!rawHref || rawHref === "#" || rawHref === "/") {
      return;
    }
    let href = null;
    try {
      // Resolve against the page URL so root-relative ("/a"), relative ("a.html")
      // and protocol-relative ("//host/a") targets all become absolute URLs.
      href = url.parse(url.resolve(pageObject.url, rawHref));
    } catch (e) {
      // One malformed link must not abort parsing of the whole page.
      this.logger("Skipping malformed link " + rawHref + " : " + e.message, "DEBUG");
      return;
    }
    pageObject.links.push({
      linkText: $(elem).text(),
      linkUrl: href
    });
  });
  callback(null, pageObject);
};
|
104 |
|
/**
 * Crawl `link`, then recursively crawl every same-host link found on it that
 * has not been seen yet, recording the results in `this.crawled`.
 *
 * Must be invoked with `this` exposing `crawled` (object keyed by URL) and
 * `base` (the host that links must match to be followed).
 *
 * For each fetched page, `this.crawled[page.url]` is an array of the page's
 * same-host, hash-less link hrefs, with the page descriptor attached as its
 * `page` property.
 *
 * @param {string} link - URL to crawl.
 * @param {Object} context - Passed through to `makeRequestHttp`.
 * @param {Function} finish - `finish(error, crawled)`, called once this page
 *   and all crawls started from it are done, or when the request for `link`
 *   fails.
 * @param {number} [recurse=0] - Initial value of the pending-children counter.
 */
const myLoop = function (link, context, finish, recurse = 0) {
  const known = this.crawled[link];
  if (known && known.page) {
    // Already fetched: answer from the cache.
    finish(null, this.crawled);
    return;
  }
  makeRequestHttp.call(this, link, context, (error, pageObject) => {
    if (error) {
      // Report the failure so a parent waiting on this page is released
      // instead of waiting forever.
      finish(error, this.crawled);
      return;
    }
    const outgoing = [];
    outgoing.page = pageObject;
    this.crawled[pageObject.url] = outgoing;

    // Keep only links on the crawled host that do not point at a fragment.
    for (const item of pageObject.links || []) {
      const target = item.linkUrl;
      if (target && target.host === this.base && !target.hash) {
        outgoing.push(target.href);
      }
    }

    let pending = recurse;
    // While children are still being started, a child completing must not
    // trigger `finish` early.
    let dispatching = true;
    const settle = () => {
      if (!dispatching && pending === 0) {
        finish(null, this.crawled);
      }
    };

    for (const childLink of outgoing) {
      if (childLink in this.crawled) {
        continue;
      }
      pending++;
      // Placeholder marks the link as claimed before its request completes.
      this.crawled[childLink] = [];
      myLoop.call(this, childLink, context, () => {
        pending--;
        settle();
      }, 0);
    }
    dispatching = false;
    settle();
  });
};
|
154 |
|
155 | module.exports = class webCrawler extends nodefony.Service {
|
156 |
|
157 | constructor(container, kernel) {
|
158 | super("WEBCRAWLER", container, container.get("notificationsCenter"));
|
159 | this.kernel = kernel;
|
160 | this.crawled = {};
|
161 | this.elastic = null;
|
162 | this.serverHttps = this.get("httpsServer");
|
163 | this.once( "onReady", () => {
|
164 | this.elastic = this.kernel.getBundle("documentation").elastic;
|
165 | });
|
166 | }
|
167 |
|
168 | siteAll(urlBase, search, context, callback) {
|
169 | var recurse = 0;
|
170 | var Link = url.parse(urlBase);
|
171 | this.base = Link.host;
|
172 | this.protocol = Link.protocol ? Link.protocol + "//" : 'http://';
|
173 | if (this.elastic) {
|
174 | myLoop.call(this, urlBase, context, function ( /*error, crawled*/ ) {});
|
175 | } else {
|
176 | myLoop.call(this, urlBase, context, (error, crawled) => {
|
177 |
|
178 | var obj = {};
|
179 | try {
|
180 | for (var page in crawled) {
|
181 |
|
182 | if (crawled && crawled[page] && crawled[page].page && crawled[page].page.selector) {
|
183 | var text = crawled[page].page.selector("body").text();
|
184 | if (!text) {
|
185 | continue;
|
186 | }
|
187 |
|
188 | let reg = new RegExp(search, 'gi');
|
189 | let index = text.search(reg);
|
190 | if (index !== -1) {
|
191 | obj[crawled[page].page.url] = {
|
192 | text: "..." + text.substring(index - 100, index + 100) + "...",
|
193 | title: crawled[page].page.title
|
194 | };
|
195 | }
|
196 | }
|
197 | }
|
198 | } catch (e) {
|
199 | this.logger(e, "ERROR");
|
200 | }
|
201 | callback(obj);
|
202 | }, recurse);
|
203 | }
|
204 | }
|
205 | };
|