UNPKG

5.74 kBJavaScriptView Raw
1//var request = require('request');
2const cheerio = require('cheerio');
3const http = require('http');
4const https = require('https');
5const async = require("async");
6
// Keep-alive agents shared by every crawl request, created on first use.
// One agent per protocol lets successive requests reuse sockets; creating a
// fresh keep-alive agent per request would never reuse a socket and would
// leave each one open until the remote end closes it.
const crawlerAgents = {};

/**
 * Returns the shared keep-alive agent for a transport, creating it on demand.
 *
 * @param {Object} transport - The `http` or `https` module.
 * @param {string} name - Cache key ("http" or "https").
 * @returns {Object} A keep-alive Agent of the given transport.
 */
const getCrawlerAgent = function (transport, name) {
  if (!crawlerAgents[name]) {
    crawlerAgents[name] = new transport.Agent({
      keepAlive: true
    });
  }
  return crawlerAgents[name];
};

/**
 * Fetches `link` with a GET request and hands the response body to parseLink.
 *
 * Must be invoked with `this` bound to the crawler service (uses
 * `this.logger` and, for https targets, `this.serverHttps`).
 * NOTE(review): `url` and `nodefony` are not required in the visible part of
 * this file — presumably they are globals provided by the framework; confirm.
 *
 * @param {string} link - URL to fetch.
 * @param {Object} context - Request context; when `context.session` is set,
 *   its `name`/`id` are forwarded as a session cookie.
 * @param {Function} callback - Node-style callback. Receives `(error)` when
 *   the request or the response stream fails, otherwise whatever parseLink
 *   passes on. It is invoked at most once.
 */
const makeRequestHttp = function (link, context, callback) {
  this.logger("REQUEST : " + link, "DEBUG");
  const target = url.parse(link);

  // cookie session
  const headers = {};
  if (context && context.session) {
    headers.Cookie = context.session.name + "=" + context.session.id;
  }

  const isHttps = target.protocol === "https:";
  const transport = isHttps ? https : http;
  const options = {
    hostname: target.hostname,
    port: target.port,
    path: target.path,
    method: 'GET',
    headers: headers,
    agent: getCrawlerAgent(transport, isHttps ? "https" : "http")
  };
  if (isHttps) {
    // Present the server's own key/certificate as client credentials.
    // NOTE(review): rejectUnauthorized:false disables verification of the
    // remote certificate — presumably because the crawled site is this
    // server with a self-signed certificate; confirm before crawling
    // anything else.
    nodefony.extend(options, {
      key: this.serverHttps.key,
      cert: this.serverHttps.cert,
      rejectUnauthorized: false,
      requestCert: true
    });
  }

  // Guards against reporting twice (e.g. a stream error followed by 'end').
  let settled = false;
  const fail = (e) => {
    this.logger('Problem with request: ' + e.message, "ERROR");
    if (settled) {
      return;
    }
    settled = true;
    // Report the failure so the caller is not left waiting forever.
    callback(e);
  };

  const req = transport.request(options, (res) => {
    let bodyRaw = "";
    res.setEncoding('utf8');
    res.on('data', (chunk) => {
      bodyRaw += chunk;
    });
    res.on('error', fail);
    res.on('end', () => {
      if (settled) {
        return;
      }
      settled = true;
      parseLink.call(this, link, bodyRaw, callback);
    });
  });
  req.on('error', fail);
  req.end();
};
63
/**
 * Parses an HTML body and collects the page title and its anchor links.
 *
 * Must be invoked with `this` bound to the crawler service (reads
 * `this.protocol` and `this.base`).
 *
 * @param {string} crawlUrl - URL the body was fetched from. A root-relative
 *   value (leading "/") is prefixed with `this.protocol + this.base`.
 * @param {string} body - Raw HTML of the page.
 * @param {Function} callback - Called as `callback(null, pageObject)` where
 *   `pageObject` has `links` (array of `{ linkText, linkUrl }`, `linkUrl`
 *   being a `url.parse` result), `url`, `title` and `selector` (the cheerio
 *   handle for the document).
 */
let parseLink = function (crawlUrl, body, callback) {
  const pageObject = {};
  pageObject.links = [];
  pageObject.url = /^\//.test(crawlUrl) ?
    this.protocol + this.base + crawlUrl :
    crawlUrl;

  const $ = cheerio.load(body, {
    ignoreWhitespace: true
  });
  pageObject.title = $('title').text();
  pageObject.selector = $;

  // find link
  $('a').each((i, elem) => {
    const rawHref = elem.attribs.href;
    // Anchors without a target, and bare "#" / "/" links, are ignored.
    if (!rawHref || rawHref === "#" || rawHref === "/") {
      return;
    }
    let absolute;
    if (/^\/(?!\/)/.test(rawHref)) {
      // Root-relative path: attach to the crawled site's origin.
      absolute = this.protocol + this.base + rawHref;
    } else {
      // Absolute URLs come back unchanged; protocol-relative ("//host/x")
      // and document-relative ("page.html") hrefs are resolved against the
      // current page instead of being glued to the base as a path or parsed
      // without any host.
      absolute = url.resolve(pageObject.url, rawHref);
    }
    pageObject.links.push({
      linkText: $(elem).text(),
      linkUrl: url.parse(absolute)
    });
  });
  callback(null, pageObject);
};
104
/**
 * Crawls `link` and, recursively, every not-yet-seen link on the same host.
 *
 * Must be invoked with `this` bound to the crawler service. Results are
 * stored in `this.crawled`: each key is a page URL and each value is an array
 * of same-host link hrefs found on that page, with the parsed page attached
 * as the array's `page` property.
 *
 * @param {string} link - URL to crawl.
 * @param {Object} context - Request context forwarded to makeRequestHttp.
 * @param {Function} finish - Called exactly once as
 *   `finish(error, this.crawled)`: with the request error when `link` itself
 *   could not be fetched, otherwise with `null` once this page and all of its
 *   descendants are done. Failures of descendant pages are not propagated.
 * @param {number} [recurse=0] - Initial count of outstanding child crawls;
 *   existing callers pass 0 (or omit it).
 */
const myLoop = function (link, context, finish, recurse) {
  // Already fetched and parsed: nothing to do.
  const known = this.crawled[link];
  if (known && known.page) {
    finish(null, this.crawled);
    return;
  }
  makeRequestHttp.call(this, link, context, (error, pageObject) => {
    if (error) {
      // Report the failure instead of returning silently, otherwise the
      // parent crawl would wait on this page forever.
      finish(error, this.crawled);
      return;
    }

    const sameSiteLinks = [];
    sameSiteLinks.page = pageObject;
    this.crawled[pageObject.url] = sameSiteLinks;

    for (const item of pageObject.links) {
      const target = item.linkUrl;
      // Keep only links pointing to the same domain, skipping in-page anchors.
      if (target && target.host === this.base && !target.hash) {
        sameSiteLinks.push(target.href);
      }
    }

    // Start at +1 for this page itself so that `finish` cannot fire before
    // every child has been scheduled, even if a child completes synchronously.
    let pending = (recurse || 0) + 1;
    const done = () => {
      pending--;
      if (pending === 0) {
        finish(null, this.crawled);
      }
    };

    for (const href of sameSiteLinks) {
      if (href in this.crawled) {
        continue;
      }
      pending++;
      // Placeholder marks the URL as scheduled so it is not crawled twice.
      this.crawled[href] = [];
      myLoop.call(this, href, context, done, 0);
    }
    done();
  });
};
154
155module.exports = class webCrawler extends nodefony.Service {
156
157 constructor(container, kernel) {
158 super("WEBCRAWLER", container, container.get("notificationsCenter"));
159 this.kernel = kernel;
160 this.crawled = {};
161 this.elastic = null;
162 this.serverHttps = this.get("httpsServer");
163 this.once( "onReady", () => {
164 this.elastic = this.kernel.getBundle("documentation").elastic;
165 });
166 }
167
168 siteAll(urlBase, search, context, callback) {
169 var recurse = 0;
170 var Link = url.parse(urlBase);
171 this.base = Link.host;
172 this.protocol = Link.protocol ? Link.protocol + "//" : 'http://';
173 if (this.elastic) {
174 myLoop.call(this, urlBase, context, function ( /*error, crawled*/ ) {});
175 } else {
176 myLoop.call(this, urlBase, context, (error, crawled) => {
177 //console.log(crawled)
178 var obj = {};
179 try {
180 for (var page in crawled) {
181
182 if (crawled && crawled[page] && crawled[page].page && crawled[page].page.selector) {
183 var text = crawled[page].page.selector("body").text();
184 if (!text) {
185 continue;
186 }
187 //var index = text.indexOf(search) ;
188 let reg = new RegExp(search, 'gi');
189 let index = text.search(reg);
190 if (index !== -1) {
191 obj[crawled[page].page.url] = {
192 text: "..." + text.substring(index - 100, index + 100) + "...",
193 title: crawled[page].page.title
194 };
195 }
196 }
197 }
198 } catch (e) {
199 this.logger(e, "ERROR");
200 }
201 callback(obj);
202 }, recurse);
203 }
204 }
205};