1 |
|
2 | 'use strict';
|
3 |
|
4 | var v = require('validator');
|
5 | var S = require('string');
|
6 | var R = require('ramda');
|
7 | var Joi = require('joi');
|
8 | var cheerio = require('cheerio');
|
9 | var uuid = require('uuid');
|
10 | var js2xmlparser = require('js2xmlparser');
|
11 | var Feed = require('feed');
|
12 |
|
13 |
|
14 |
|
15 |
|
16 |
|
17 |
|
18 |
|
19 |
|
20 |
|
21 |
|
22 |
|
23 |
|
24 |
|
25 |
|
/**
 * Builds the default value of every supported option.
 *
 * A new object (with new arrays) is created on each call, so the result can
 * be merged into or mutated without affecting later calls.
 *
 * @returns {object} The default options.
 */
var getDefaultOptions = function() {
  return {
    // The spelling (sic) matches the key the options schema accepts and the
    // scraper reads; it is listed here so every option has an explicit default.
    contractAdjecent: false,
    filterKeywords: [],
    filterLocale: false,
    filterTexts: [],
    limit: null,
    output: 'json',
    search: [],
    selector: 'h1, h2, h3, h4, h5, h6, p',
    url: null
  };
};
|
38 |
|
39 |
|
40 |
|
41 |
|
42 |
|
/**
 * Builds the Joi schema that user-supplied options are validated against.
 *
 * The "unset" values used by the defaults (`false` for filterLocale, `null`
 * for limit and url) are explicitly allowed, so passing a default value
 * explicitly is not rejected.
 *
 * @returns {object} A Joi object schema describing the options.
 */
var getOptionsSchema = function() {
  return Joi.object().keys({
    filterKeywords: Joi.array(),
    // 'en' is the only accepted locale; `false` means "no locale filter".
    filterLocale: Joi.string().valid('en').allow(false),
    filterTexts: Joi.array(),
    // `null` means "no limit".
    limit: Joi.number().integer().allow(null),
    output: Joi.string().valid('json', 'xml', 'atom', 'rss'),
    search: Joi.array(),
    selector: Joi.string(),
    // `null` means "no source url known".
    url: Joi.string().allow(null),
    // The key is spelled "Adjecent" (sic); it is part of the public options.
    contractAdjecent: Joi.boolean()
  });
};
|
56 |
|
57 |
|
58 |
|
59 |
|
60 |
|
61 |
|
/**
 * Resolves the options to run with: the defaults, overridden by whatever the
 * caller supplied.
 *
 * @param {object} [options] User options; validated against the schema unless
 *   `undefined`.
 * @returns {object} Defaults merged with the user options.
 * @throws The error Joi hands to the validation callback, if any.
 */
var getOptions = function(options) {
  var defaults = getDefaultOptions();

  // Nothing supplied: there is nothing to validate or override.
  if (options === undefined) {
    return R.merge(defaults, {});
  }

  Joi.validate(options, getOptionsSchema(), function(validationError) {
    if (validationError) {
      throw validationError;
    }
  });

  return R.merge(defaults, options);
};
|
77 |
|
78 |
|
79 |
|
80 |
|
81 |
|
82 |
|
83 |
|
84 |
|
/**
 * Finds the href of the anchor "closest" to a DOM object.
 *
 * The first location that holds an anchor wins, in this order: the object
 * itself, its children, its descendants, its ancestors, its next sibling.
 * Only that one location is consulted, even if its anchor has no href.
 *
 * @param {object} domObject Cheerio selection to start from.
 * @param {string} url Url of the page; used as fallback and as prefix for
 *   hrefs that are not urls themselves.
 * @returns {string} The resolved href, or `url` when there is no usable href.
 */
var getCheerioClosestHref = function(domObject, url) {
  var lookups = ['children', 'find', 'closest', 'next'];
  var anchor = null;
  var found;
  var href;
  var i;

  if (domObject[0].name === 'a') {
    anchor = domObject;
  } else {
    for (i = 0; i < lookups.length; ++i) {
      found = domObject[lookups[i]]('a');

      if (found.length > 0) {
        anchor = found;
        break;
      }
    }
  }

  href = anchor ? anchor.attr('href') : null;

  // No href at all, or a bare fragment link: fall back to the page url.
  if (!href || href === '#') {
    return url;
  }

  // Anything that is neither a valid url nor starts with "http" is treated as
  // relative and gets the page url prepended.
  if (url && !v.isURL(href) && href.indexOf('http') !== 0) {
    return url + href;
  }

  return href;
};
|
133 |
|
134 |
|
135 |
|
136 |
|
137 |
|
138 |
|
/**
 * Turns scraped markup into plain display text using string.js: HTML
 * entities are unescaped, tags are stripped, whitespace is collapsed and the
 * result is trimmed.
 *
 * @param {string} text Raw text.
 * @returns {string} The cleaned text.
 */
var sanitizeText = function(text) {
  var unescaped = S(text).unescapeHTML();
  var tagless = unescaped.stripTags();
  var collapsed = tagless.collapseWhitespace();

  return collapsed.s.trim();
};
|
147 |
|
148 |
|
149 |
|
150 |
|
151 |
|
152 |
|
153 |
|
/**
 * Removes every character that is not a letter, a digit, a space or a
 * hyphen. Hyphens are replaced by spaces.
 *
 * A character counts as a letter when its lower- and upper-case forms differ.
 * The comparison is made per character (surrogate pairs are kept together)
 * rather than by indexing into the case-converted whole string: case
 * conversion can change the length of a string (e.g. "ß" upper-cases to
 * "SS"), which would shift the indexes and let special characters through.
 *
 * @param {string} text The text to clean.
 * @returns {string} The text without special characters.
 */
var removeSpecials = function(text) {
  var keptAsIs = '0123456789 ';
  var result = '';
  var ch;
  var code;
  var next;
  var i;

  for (i = 0; i < text.length; ++i) {
    ch = text[i];
    code = text.charCodeAt(i);

    // Keep a high surrogate together with its low surrogate so characters
    // outside the basic plane are judged (and kept or dropped) as a whole.
    if (code >= 0xD800 && code <= 0xDBFF && i + 1 < text.length) {
      next = text.charCodeAt(i + 1);

      if (next >= 0xDC00 && next <= 0xDFFF) {
        ch += text[i + 1];
        ++i;
      }
    }

    if (ch === '-') {
      result += ' ';
    } else if (ch.toLowerCase() !== ch.toUpperCase() ||
               keptAsIs.indexOf(ch) > -1) {
      result += ch;
    }
  }

  return result;
};
|
169 |
|
170 |
|
171 |
|
172 |
|
173 |
|
174 |
|
/**
 * Reduces text to a normalized form for comparisons: special characters
 * removed, lower-cased, whitespace collapsed and trimmed.
 *
 * When given something Ramda considers array-like, every element is
 * simplified (recursively) and the mapped result is returned.
 *
 * @param {string|Array} text A text or a list of texts.
 * @returns {string|Array} The simplified text(s).
 */
var simplifyText = function(text) {
  if (R.isArrayLike(text)) {
    return R.map(simplifyText, text);
  }

  var lowered = removeSpecials(text).toLowerCase();

  return S(lowered).collapseWhitespace().s.trim();
};
|
186 |
|
187 |
|
188 |
|
189 |
|
190 |
|
191 |
|
192 |
|
/**
 * Reads the content of the meta tag(s) with a given name.
 *
 * @param {function} $ Loaded cheerio document.
 * @param {string} name Value of the `name` attribute to look for.
 * @returns {string} The `content` values of all matching meta tags, joined
 *   with commas; an empty string when none match.
 */
var getCheerioMetaTag = function($, name) {
  var matching = $('meta').filter(function() {
    return $(this).attr('name') === name;
  });

  var contents = matching.map(function() {
    return $(this).attr('content');
  }).get();

  return contents.join();
};
|
200 |
|
201 |
|
202 |
|
203 |
|
204 |
|
205 |
|
206 |
|
/**
 * Counts how often each keyword occurs in a text as a whole word.
 *
 * The text is split on whitespace and a keyword is counted only when it
 * equals a complete token, so "the" is not counted inside "other" or
 * "theory". Keywords are expected to be single words; a keyword that itself
 * contains whitespace can never equal a token and gets a count of 0.
 *
 * @param {string} text Whitespace-separated text to count in.
 * @param {string[]} keywords The keywords to count.
 * @returns {Array<{word: string, count: number}>} One entry per keyword, in
 *   the order given.
 */
var countKeywords = function(text, keywords) {
  // Prototype-less dictionary, so words such as "constructor" are plain keys.
  var tally = Object.create(null);
  var tokens = String(text).split(/\s+/);
  var token;
  var i;

  // One pass over the text builds all counts; each keyword is then a lookup.
  for (i = 0; i < tokens.length; ++i) {
    token = tokens[i];

    if (token !== '') {
      tally[token] = (tally[token] || 0) + 1;
    }
  }

  return Array.prototype.map.call(keywords, function(keyword) {
    return {
      word: keyword,
      count: tally[keyword] || 0
    };
  });
};
|
215 |
|
216 |
|
217 |
|
218 |
|
219 |
|
220 |
|
221 |
|
/**
 * Extracts the keywords of a text together with their counts.
 *
 * The text is simplified, split on spaces into unique words, and the stop
 * words are removed; the remaining words are counted in the simplified text.
 *
 * @param {string} text The text to extract keywords from.
 * @param {string[]} stopWords Words that must not become keywords.
 * @returns {Array<{word: string, count: number}>} The keywords with counts.
 */
var getKeywordsFromText = function(text, stopWords) {
  var simple = simplifyText(text);
  var uniqueWords = R.uniq(simple.split(' '));
  var candidates = R.difference(uniqueWords, stopWords);

  return countKeywords(simple, candidates);
};
|
229 |
|
230 |
|
231 |
|
232 |
|
233 |
|
234 |
|
235 |
|
236 |
|
237 |
|
238 |
|
239 |
|
240 |
|
241 |
|
/**
 * Collects meta information about a loaded page.
 *
 * @param {function} $ Loaded cheerio document.
 * @param {string} url Url the page came from; passed through unchanged.
 * @returns {{author: string, description: string, url: string, time: number,
 *   title: string}} Author and description from the meta tags, the url, the
 *   current time (`Date.now()`) and the sanitized text of the title tag.
 */
var getCheerioMeta = function($, url) {
  var meta = {};

  meta.author = getCheerioMetaTag($, 'author');
  meta.description = getCheerioMetaTag($, 'description');
  meta.url = url;
  meta.time = Date.now();
  meta.title = sanitizeText($('title').text());

  return meta;
};
|
251 |
|
252 |
|
253 |
|
254 |
|
255 |
|
256 |
|
257 |
|
258 |
|
259 |
|
/**
 * Derives batch meta information from already scraped nuggets, using the
 * first nugget as representative for the whole list.
 *
 * @param {object[]} nuggets Non-empty list of nuggets.
 * @returns {{url: *, time: *}} The `source` and `timestamp` of the first
 *   nugget.
 */
var getNuggetsMeta = function(nuggets) {
  var first = nuggets[0];

  return {
    url: first.source,
    time: first.timestamp
  };
};
|
266 |
|
267 |
|
268 |
|
269 |
|
270 |
|
/**
 * Lists every distinct keyword text that occurs in a list of nuggets.
 *
 * @param {object[]} nuggets Nuggets whose `keywords` are lists of objects
 *   with a `word` property.
 * @returns {string[]} The unique keyword words across all nuggets.
 */
var getNuggetsKeywordTexts = function(nuggets) {
  var keywordLists = R.map(function(nugget) {
    return nugget.keywords;
  }, nuggets);

  var words = R.map(function(keyword) {
    return keyword.word;
  }, R.flatten(keywordLists));

  return R.uniq(words);
};
|
277 |
|
278 |
|
279 |
|
280 |
|
281 |
|
282 |
|
/**
 * Renders a batch as an XML string with a `goldwasher` root element and one
 * `nugget` entry per nugget.
 *
 * Each nugget is shallow-copied before its keywords are wrapped for the XML
 * structure. The nuggets in `batch` (which may be the caller's own array) are
 * left untouched, so rendering the same nuggets more than once always starts
 * from the original keyword lists.
 *
 * @param {{nuggets: object[]}} batch The batch to render.
 * @returns {string} The XML produced by js2xmlparser.
 */
var outputToXml = function(batch) {
  var xmlNuggets = R.map(function(nugget) {
    // Same shape as before ({keyword: [<keyword list>]}), but on a copy.
    return R.merge(nugget, {
      keywords: {
        keyword: [nugget.keywords]
      }
    });
  }, batch.nuggets);

  return js2xmlparser.parse('goldwasher', {nugget: xmlNuggets});
};
|
294 |
|
295 |
|
296 |
|
297 |
|
298 |
|
299 |
|
300 |
|
/**
 * Renders a batch as a feed.
 *
 * The feed gets one category per distinct keyword and one item per nugget.
 * Items are dated with the nugget's own `timestamp`, so rendering stored
 * nuggets again does not change their dates; the current time is used only
 * when a nugget has no usable timestamp.
 *
 * @param {{meta: object, nuggets: object[]}} batch The batch to render.
 * @param {string} output Format identifier handed to `feed.render`
 *   (the caller in this file passes 'atom-1.0' or 'rss-2.0').
 * @returns {string} The rendered feed.
 */
var outputToFeed = function(batch, output) {
  var feed = new Feed({
    description: batch.meta.description,
    link: batch.meta.url,
    title: batch.meta.title,
    id: 'goldwasher-' + batch.meta.url
  });

  R.forEach(function(keyword) {
    feed.addCategory(keyword);
  }, getNuggetsKeywordTexts(batch.nuggets));

  R.forEach(function(nugget) {
    var hasTimestamp = nugget.timestamp !== undefined &&
                       nugget.timestamp !== null;
    var published = hasTimestamp ? new Date(nugget.timestamp) : new Date();

    // An unparseable timestamp yields an invalid Date; fall back to now.
    if (isNaN(published.getTime())) {
      published = new Date();
    }

    feed.addItem({
      author: [{
        name: batch.meta.author,
        link: batch.meta.url
      }],
      date: published,
      description: nugget.text,
      guid: nugget.href,
      link: nugget.href,
      title: nugget.text
    });
  }, batch.nuggets);

  return feed.render(output);
};
|
329 |
|
330 |
|
331 |
|
332 |
|
333 |
|
334 |
|
335 |
|
/**
 * Tells whether a text contains at least one of the given search strings.
 *
 * @param {string} text The text to search in.
 * @param {string[]} list The strings to look for.
 * @returns {boolean} `true` when any list item occurs in the text.
 */
var textSearch = function(text, list) {
  var haystack = S(text);

  var hits = R.filter(function(needle) {
    return haystack.contains(needle);
  }, list);

  return hits.length > 0;
};
|
341 |
|
342 |
|
343 |
|
344 |
|
345 |
|
346 |
|
347 |
|
348 |
|
349 |
|
350 |
|
/**
 * Decides whether a nugget text has to be discarded.
 *
 * A text is discarded when it is one of the stop texts, or when search terms
 * were given and the text contains none of them.
 *
 * @param {string} text The (simplified) nugget text.
 * @param {string[]} stopTexts Texts that are never wanted.
 * @param {string[]} search Search terms; an empty list disables the search.
 * @returns {boolean|undefined} `true` to discard the text, `undefined` to
 *   keep it.
 */
var nuggetFilter = function(text, stopTexts, search) {
  // Nothing is left of [text] after removing the stop texts when text is one.
  var isStopText = R.difference([text], stopTexts).length === 0;

  if (isStopText || (search.length > 0 && !textSearch(text, search))) {
    return true;
  }

  return undefined;
};
|
362 |
|
363 |
|
364 |
|
365 |
|
366 |
|
367 |
|
368 |
|
/**
 * Assembles the simplified list of stop words.
 *
 * @param {string[]} filterKeywords Stop words supplied by the user.
 * @param {string|boolean} filterLocale Locale whose stop word file
 *   (`../stop_words/<locale>.json`, property `stopWords`) is appended to the
 *   user's words; falsy to use the user's words only.
 * @returns {string[]} All stop words, simplified.
 */
var getStopwords = function(filterKeywords, filterLocale) {
  if (!filterLocale) {
    return simplifyText(filterKeywords);
  }

  var localeJson = require('../stop_words/' + filterLocale + '.json');

  return simplifyText(R.concat(filterKeywords, localeJson.stopWords));
};
|
384 |
|
385 |
|
386 |
|
387 |
|
388 |
|
389 |
|
390 |
|
391 |
|
392 |
|
393 |
|
394 |
|
395 |
|
396 |
|
397 |
|
398 |
|
/**
 * Joins the text of an element with the text of the same-tag siblings that
 * directly precede it, in document order and separated by single spaces.
 *
 * The walk stops at the first preceding sibling with a different tag, so only
 * an unbroken run of adjacent elements is merged.
 *
 * @param {object} element Cheerio selection holding the last element of a run.
 * @returns {string} The combined text of the run.
 */
var getContractedText = function(element) {
  var tagName = element[0].name;
  var text = element.text();
  var previous = element.prev();

  while (tagName && previous.length > 0 && previous[0].name === tagName) {
    text = previous.text() + ' ' + text;
    previous = previous.prev();
  }

  return text;
};

/**
 * Scrapes nuggets from a loaded cheerio document.
 *
 * Every element matching `options.selector` whose text survives sanitizing
 * and filtering becomes a nugget with the properties source, href, tag, text,
 * timestamp, uuid, batch, keywords, total and position.
 *
 * With `options.contractAdjecent`, a run of adjacent siblings with the same
 * tag is merged into a single nugget, emitted for the last element of the run.
 *
 * @param {function} $ Loaded cheerio document.
 * @param {object} options Resolved options (selector, url, search,
 *   filterTexts, filterKeywords, filterLocale, limit, contractAdjecent).
 * @returns {{meta: object, nuggets: object[]}} Page meta data and nuggets.
 */
var inputFromCheerio = function($, options) {
  var meta = getCheerioMeta($, options.url);
  var batchUuid = uuid.v1();
  var simpleSearch = simplifyText(options.search);
  var simpleStopTexts = simplifyText(options.filterTexts);
  var simpleStopWords = getStopwords(
    options.filterKeywords,
    options.filterLocale
  );
  var nuggets = [];
  var i;

  $(options.selector).each(function() {
    var element = $(this);
    var current = element[0];
    var next = element.next()[0];
    var text;
    var simpleText;

    if (options.contractAdjecent) {
      // Not the last element of its run: the text is picked up when the last
      // element of the run is reached.
      if (next && next.name && next.name === current.name) {
        return;
      }

      text = getContractedText(element);
    } else {
      text = element.text();
    }

    text = sanitizeText(text);
    simpleText = simplifyText(text);

    // Nothing but special characters or whitespace.
    if (!simpleText) {
      return;
    }

    if (nuggetFilter(simpleText, simpleStopTexts, simpleSearch)) {
      return;
    }

    nuggets.push({
      source: options.url,
      href: getCheerioClosestHref(element, options.url),
      tag: current.name,
      text: text,
      timestamp: meta.time,
      uuid: uuid.v1(),
      batch: batchUuid,
      keywords: getKeywordsFromText(text, simpleStopWords)
    });
  });

  if (options.limit) {
    nuggets = R.take(options.limit, nuggets);
  }

  // Total and position refer to the list after the limit has been applied.
  for (i = 0; i < nuggets.length; ++i) {
    nuggets[i].total = nuggets.length;
    nuggets[i].position = i;
  }

  return {
    meta: meta,
    nuggets: nuggets
  };
};
|
472 |
|
473 |
|
474 |
|
475 |
|
476 |
|
477 |
|
/**
 * Determines what kind of input was handed to goldwasher.
 *
 * - Strings containing an XML declaration (`<?xml ...`) are 'xml'; the
 *   declaration may use any quoting, version or encoding.
 * - Other strings containing `<` are 'html'.
 * - Objects with an own `parseHTML` property are 'cheerio'.
 * - Non-empty arrays (or array-like objects) whose first element has an own
 *   `timestamp` property are 'array', i.e. a list of nuggets.
 *
 * Anything else, including `null`, `undefined` and empty arrays, is rejected
 * with a descriptive error.
 *
 * @param {*} input The input to classify.
 * @returns {string} 'xml', 'html', 'cheerio' or 'array'.
 * @throws {Error} When the input type cannot be determined.
 */
var detectInputType = function(input) {
  var hasOwn = Object.prototype.hasOwnProperty;
  var isString = typeof input === 'string' || input instanceof String;
  var first;

  if (isString) {
    if (/<\?xml\s/.test(input)) {
      return 'xml';
    }

    if (String(input).indexOf('<') !== -1) {
      return 'html';
    }

    throw new Error('Could not determine input type. (string)');
  }

  // null and undefined have no properties to inspect; they fall through to
  // the error below instead of failing inside a property check.
  if (input !== null && input !== undefined) {
    if (hasOwn.call(input, 'parseHTML')) {
      return 'cheerio';
    }

    if (typeof input === 'object' &&
        typeof input.length === 'number' &&
        input.length > 0) {
      first = input[0];

      if (first !== null && first !== undefined &&
          hasOwn.call(first, 'timestamp')) {
        return 'array';
      }
    }
  }

  throw new Error('Could not determine input type. (cheerio/array)');
};
|
508 |
|
509 |
|
510 |
|
511 |
|
512 |
|
513 |
|
514 |
|
515 |
|
/**
 * Entry point: turns the input into nuggets and returns them in the
 * requested output format.
 *
 * Arrays of nuggets are used as they are, a loaded cheerio document is
 * scraped directly, and any other detected input (html/xml strings) is loaded
 * with cheerio first and then scraped.
 *
 * @param {string|function|object[]} input Markup, a loaded cheerio document
 *   or a list of nuggets.
 * @param {object} [userOptions] Options overriding the defaults.
 * @returns {object[]|string} The nuggets for the default ('json') output, or
 *   the rendered string for 'xml', 'atom' and 'rss'.
 */
var goldwasher = function(input, userOptions) {
  var options = getOptions(userOptions);
  var batch;

  options.input = detectInputType(input);

  switch (options.input) {
    case 'array':
      batch = {
        meta: getNuggetsMeta(input),
        nuggets: input
      };
      break;
    case 'cheerio':
      batch = inputFromCheerio(input, options);
      break;
    default:
      batch = inputFromCheerio(cheerio.load(input), options);
  }

  switch (options.output) {
    case 'xml':
      return outputToXml(batch);
    case 'atom':
      return outputToFeed(batch, 'atom-1.0');
    case 'rss':
      return outputToFeed(batch, 'rss-2.0');
    default:
      return batch.nuggets;
  }
};
|
551 |
|
552 | module.exports = goldwasher; |
\ | No newline at end of file |