UNPKG

12.4 kBJavaScriptView Raw
1// jscs:disable requirePaddingNewLinesAfterBlocks
2'use strict';
3
4var v = require('validator');
5var S = require('string');
6var R = require('ramda');
7var Joi = require('joi');
8var cheerio = require('cheerio');
9var uuid = require('uuid');
10var js2xmlparser = require('js2xmlparser');
11var Feed = require('feed');
12
/**
 * Builds the default option set.
 * A brand-new object (with brand-new arrays) is created on every call, so
 * the result can be modified without affecting later calls.
 * @returns {{
 *   filterKeywords: Array,
 *   filterLocale: boolean,
 *   filterTexts: Array,
 *   limit: null,
 *   output: string,
 *   search: Array,
 *   selector: string,
 *   url: null
 * }}
 */
var getDefaultOptions = function() {
  var defaults = {};

  defaults.filterKeywords = [];
  defaults.filterLocale = false;
  defaults.filterTexts = [];
  defaults.limit = null;
  defaults.output = 'json';
  defaults.search = [];
  defaults.selector = 'h1, h2, h3, h4, h5, h6, p';
  defaults.url = null;

  return defaults;
};
38
/**
 * Builds the Joi schema that user supplied options are checked against.
 * @returns {Object} a Joi object schema with one key per supported option
 */
var getOptionsSchema = function() {
  var optionRules = {
    filterKeywords: Joi.array(),
    filterLocale: Joi.string().valid('en'),
    filterTexts: Joi.array(),
    limit: Joi.number().integer(),
    output: Joi.string().valid('json', 'xml', 'atom', 'rss'),
    search: Joi.array(),
    selector: Joi.string(),
    url: Joi.string(),
    contractAdjecent: Joi.boolean()
  };

  return Joi.object().keys(optionRules);
};
56
/**
 * Produces the effective options: the defaults overlaid with whatever the
 * user passed. User options are validated against the Joi schema first.
 * @param options user supplied options, or undefined to use only defaults
 * @returns {Object} merged options
 * @throws the Joi validation error when the user options are invalid
 */
var getOptions = function(options) {
  var overrides = options;
  var failOnError = function(error) {
    if (error) {
      throw error;
    }
  };

  if (overrides === undefined) {
    // nothing supplied: nothing to validate, nothing to override
    overrides = {};
  } else {
    Joi.validate(overrides, getOptionsSchema(), failOnError);
  }

  // values present in overrides win over the defaults
  return R.merge(getDefaultOptions(), overrides);
};
77
/**
 * Finds the HREF of the <a> tag closest to a cheerio selection.
 * Lookup order: the element itself, a direct child, any descendant, an
 * ancestor and finally the next sibling. The first place that contains an
 * <a> decides the result, even if that <a> has no href attribute.
 * Relative HREFs are prefixed with url.
 * @param domObject cheerio selection to inspect
 * @param url address of the scraped page; used as prefix and as fallback
 * @returns {*} the HREF, or url when no usable HREF exists (including when
 * the selection is empty or missing)
 */
var getCheerioClosestHref = function(domObject, url) {
  var lookups = ['children', 'find', 'closest', 'next'];
  var href = null;
  var anchors;
  var i;

  // an empty selection has no element to inspect; fall back to url instead
  // of failing on domObject[0].name
  if (!domObject || !domObject[0]) {
    return url;
  }

  if (domObject[0].name === 'a') {
    // has HREF itself
    href = domObject.attr('href');
  } else {
    // child, descendant, ancestor, adjacent node: first hit wins
    for (i = 0; i < lookups.length; i++) {
      anchors = domObject[lookups[i]]('a');

      if (anchors.length > 0) {
        href = anchors.attr('href');
        break;
      }
    }
  }

  // a bare fragment points nowhere useful, return url
  if (href === '#') {
    return url;
  }

  // NOTE(review): relative HREFs are resolved by plain concatenation, which
  // is only correct when url and href happen to join cleanly; kept as is to
  // preserve existing output.
  if (url &&
    href &&
    !v.isURL(href) &&
    href.indexOf('http') !== 0) {
    href = url + href;
  }

  // if still no href, just return url
  if (!href) {
    return url;
  }

  return href;
};
133
/**
 * Cleans up a piece of scraped text: HTML entities are unescaped, tags are
 * stripped, runs of whitespace are collapsed and the ends are trimmed.
 * @param text raw text
 * @returns {string} cleaned text
 */
var sanitizeText = function(text) {
  var unescaped = S(text).unescapeHTML();
  var withoutTags = unescaped.stripTags();
  var collapsed = withoutTags.collapseWhitespace();

  return collapsed.s.trim();
};
147
/**
 * Remove special characters.
 * Keeps every character that has distinct lower and upper case forms
 * (i.e. letters), digits and spaces. Hyphens are turned into spaces.
 * Everything else is dropped.
 * Based on Seagull's answer: http://stackoverflow.com/a/26482650
 * @param text string to clean
 * @returns {string}
 * @throws {TypeError} when text is not a string
 */
var removeSpecials = function(text) {
  var keepAsIs = '0123456789 ';
  var result = '';
  var character;
  var i;

  if (typeof text !== 'string') {
    throw new TypeError('removeSpecials expects a string');
  }

  for (i = 0; i < text.length; ++i) {
    character = text[i];

    if (character === '-') {
      result += ' ';
    } else if (keepAsIs.indexOf(character) > -1 ||

      // Case is compared per character: converting the whole string first
      // can change its length (e.g. sharp s upper-cases to two letters), which
      // would shift every following comparison onto the wrong character.
      character.toLowerCase() !== character.toUpperCase()) {
      result += character;
    }
  }

  return result;
};
169
/**
 * Reduces a text to a bare form that is easy to compare: special characters
 * removed, lower cased, whitespace collapsed and trimmed.
 * Array-like input is simplified element by element.
 * @param text a string or a list of strings
 * @returns {*} the simplified string, or a list of simplified strings
 */
var simplifyText = function(text) {
  var lowered;

  if (!R.isArrayLike(text)) {
    lowered = removeSpecials(text).toLowerCase();
    return S(lowered).collapseWhitespace().s.trim();
  }

  return R.map(simplifyText, text);
};
186
/**
 * Gets the content of the meta tag(s) with a given name.
 * When several meta tags share the name, their contents are joined with
 * commas; when none does, the result is an empty string.
 * @param $ cheerio instance of the page
 * @param name value of the meta tag's name attribute
 * @returns {string}
 */
var getCheerioMetaTag = function($, name) {
  var hasRequestedName = function() {
    return $(this).attr('name') === name;
  };
  var readContent = function() {
    return $(this).attr('content');
  };
  var contents = $('meta')
    .filter(hasRequestedName)
    .map(readContent)
    .get();

  return contents.join();
};
200
/**
 * Counts how often each keyword occurs in a text.
 * @param text text to search in
 * @param keywords list of keywords to count
 * @returns {Array|*} one {word, count} object per keyword, in input order
 */
var countKeywords = function(text, keywords) {
  var toWordCount = function(keyword) {
    var occurrences = S(text).count(keyword);

    return {
      word: keyword,
      count: occurrences
    };
  };

  return R.map(toWordCount, keywords);
};
215
/**
 * Extracts the keywords of a text together with their counts.
 * The text is simplified, split on spaces and de-duplicated; stop words are
 * then removed and the remaining words are counted in the simplified text.
 * @param text text to extract keywords from
 * @param stopWords words that must not be reported as keywords
 * @returns {Array|*} list of {word, count} objects
 */
var getKeywordsFromText = function(text, stopWords) {
  var simplified = simplifyText(text);
  var uniqueWords = R.uniq(simplified.split(' '));
  var candidates = R.difference(uniqueWords, stopWords);

  return countKeywords(simplified, candidates);
};
229
/**
 * Collects the page level meta data that is needed for building feeds.
 * @param $ cheerio instance of the page
 * @param url address the page was scraped from
 * @returns {{
 *   author: string,
 *   description: string,
 *   url: *,
 *   time: number,
 *   title: string
 * }} time is the moment of the call, in milliseconds since the epoch
 */
var getCheerioMeta = function($, url) {
  var author = getCheerioMetaTag($, 'author');
  var description = getCheerioMetaTag($, 'description');
  var now = Date.now();
  var title = sanitizeText($('title').text());

  return {
    author: author,
    description: description,
    url: url,
    time: now,
    title: title
  };
};
251
/**
 * Derives feed meta data from an existing list of nuggets.
 * Only the first nugget is consulted.
 * @param nuggets non-empty list of nuggets
 * @returns {{
 *   url: string,
 *   time: number
 * }} the source and timestamp of the first nugget
 */
var getNuggetsMeta = function(nuggets) {
  var first = nuggets[0];
  var meta = {};

  meta.url = first.source;
  meta.time = first.timestamp;

  return meta;
};
266
/**
 * Lists every distinct keyword text that occurs in an array of nuggets.
 * Pipeline: keyword lists per nugget -> one flat list of keyword objects
 * -> their words -> duplicates removed.
 * @type {Function|*}
 */
var getNuggetsKeywordTexts = R.pipe(
  R.map(R.prop('keywords')),
  R.flatten(),
  R.map(R.prop('word')),
  R.uniq()
);
277
/**
 * Converts a batch of nuggets to a string of XML.
 * Each nugget is serialised with its keywords moved below a "keyword" key.
 * The nuggets in the batch are left untouched: the reshaping is done on
 * shallow copies, so the caller's data can be reused or converted again.
 * @param batch object with a "nuggets" list
 * @returns {*} whatever js2xmlparser.parse produces for the batch
 */
var outputToXml = function(batch) {
  var nuggets = R.map(function(nugget) {

    // same shape the serialiser has always been given: the original
    // keywords list wrapped in a one element array under "keyword"
    return R.merge(nugget, {
      keywords: {
        keyword: [nugget.keywords]
      }
    });
  }, batch.nuggets);

  return js2xmlparser.parse('goldwasher', {nugget: nuggets});
};
294
/**
 * Renders a batch of nuggets as a feed.
 * The batch meta data describes the feed itself, every distinct keyword
 * becomes a category and every nugget becomes an item.
 * @param batch object with "meta" and "nuggets"
 * @param output feed format identifier handed to the feed renderer
 * @returns {*} the rendered feed
 */
var outputToFeed = function(batch, output) {
  var meta = batch.meta;
  var feed = new Feed({
    description: meta.description,
    link: meta.url,
    title: meta.title,
    id: 'goldwasher-' + meta.url
  });
  var addCategory = function(keyword) {
    feed.addCategory(keyword);
  };
  var addItem = function(nugget) {
    var author = {
      name: batch.meta.author,
      link: batch.meta.url
    };

    feed.addItem({
      author: [author],
      date: new Date(),
      description: nugget.text,
      guid: nugget.href,
      link: nugget.href,
      title: nugget.text
    });
  };

  R.forEach(addCategory, getNuggetsKeywordTexts(batch.nuggets));
  R.forEach(addItem, batch.nuggets);

  return feed.render(output);
};
329
/**
 * Tells whether "text" contains at least one of the texts in "list".
 * @param text text to search in
 * @param list texts to look for
 * @returns {boolean}
 */
var textSearch = function(text, list) {
  var isContained = function(candidate) {
    return S(text).contains(candidate);
  };
  var matches = R.filter(isContained, list);

  return Boolean(matches.length);
};
341
/**
 * Decides whether a nugget should be removed.
 * A nugget is removed when its text equals one of the stop texts, or when
 * search texts are given and the text contains none of them.
 * @param text simplified text of the nugget
 * @param stopTexts texts that disqualify a nugget when matched exactly
 * @param search texts of which at least one must occur in the nugget text;
 * an empty list disables the search
 * @returns {boolean} true if the nugget should be removed, false otherwise
 */
var nuggetFilter = function(text, stopTexts, search) {
  // nothing of [text] is left once the stop texts are taken away,
  // so text is one of the stop texts
  if (R.difference([text], stopTexts).length === 0) {
    return true;
  }

  if (search.length > 0 && !textSearch(text, search)) {
    return true;
  }

  return false;
};
362
/**
 * Assembles the complete, simplified list of stop words.
 * When a locale is given, the stop words stored for that locale are loaded
 * and appended to the user supplied ones.
 * @param filterKeywords user supplied stop words
 * @param filterLocale locale whose stop word file should be included, or a
 * falsy value to skip it
 * @returns {*} simplified stop words
 */
var getStopwords = function(filterKeywords, filterLocale) {
  var localeJson;

  if (!filterLocale) {
    return simplifyText(filterKeywords);
  }

  localeJson = require('../stop_words/' + filterLocale + '.json');

  return simplifyText(R.concat(filterKeywords, localeJson.stopWords));
};
384
385/**
386 * Converts a cheerio object to a batch of nuggets.
387 * @param $
388 * @param options
389 * @returns {{
390 * meta: {
391 * author: (*|string|String|jQuery),
392 * description: (*|string|String|jQuery),
393 * url: *,
394 * time: (number|*),
395 * title: (String.s|*)},
396 * nuggets: Array
397 * }}
398 */
399var inputFromCheerio = function($, options) {
400 var meta = getCheerioMeta($, options.url);
401 var scraped = $(options.selector);
402 var nuggets = [];
403 var batchUuid = uuid.v1();
404 var simpleSearch = simplifyText(options.search);
405 var simpleStopTexts = simplifyText(options.filterTexts);
406 var simpleStopWords = getStopwords(
407 options.filterKeywords,
408 options.filterLocale
409 );
410
411 scraped.each(function() {
412 var nugget = {};
413 var current = $(this)[0];
414 var prev = $(this).prev()[0];
415 var next = $(this).next()[0];
416 var text = $(this).text();
417
418 if (options.contractAdjecent) {
419 if (next && next.name && next.name === current.name) {
420 return;
421 } else if (prev && prev.name && prev.name === current.name) {
422 var siblings = $(this).prevAll(current.name);
423 siblings.each(function() {
424 text = $(this).text() + ' ' + text;
425 });
426 }
427 }
428
429 text = sanitizeText(text);
430 var simpleText = simplifyText(text);
431
432 if (simpleText === '' || simpleText === null) {
433 return;
434 }
435
436 if (nuggetFilter(
437 simpleText,
438 simpleStopTexts,
439 simpleSearch
440 )) {
441 return;
442 }
443
444 nugget.source = options.url;
445 nugget.href = getCheerioClosestHref($(this), options.url);
446 nugget.tag = $(this)[0].name;
447 nugget.text = text;
448 nugget.timestamp = meta.time;
449 nugget.uuid = uuid.v1();
450 nugget.batch = batchUuid;
451 nugget.keywords = getKeywordsFromText(text, simpleStopWords);
452
453 nuggets.push(nugget);
454 });
455
456 if (options.limit) {
457 nuggets = R.take(options.limit, nuggets);
458 }
459
460 var index = 0;
461 nuggets = R.forEach(function(nugget) {
462 nugget.total = nuggets.length;
463 nugget.position = index;
464 index++;
465 }, nuggets);
466
467 return {
468 meta: meta,
469 nuggets: nuggets
470 };
471};
472
/**
 * Detects the type of input. Could be improved a lot.
 * Strings are "xml" when they contain the standard UTF-8 XML declaration
 * and "html" when they contain a "<". Anything with an own "parseHTML"
 * property is taken to be a cheerio object, and an array-like whose first
 * element has an own "timestamp" property is a goldwasher array.
 * @param input the value handed to goldwasher
 * @returns {string} one of 'xml', 'html', 'cheerio' or 'array'
 * @throws {Error} when the type cannot be determined; this includes null,
 * undefined and empty arrays
 */
var detectInputType = function(input) {
  var isPresent = function(value) {
    return value !== null && value !== undefined;
  };

  if (R.is(String, input)) {

    // detect if XML
    if (S(input).contains('<?xml version="1.0" encoding="UTF-8"?>')) {
      return 'xml';
    }

    // detect if HTML. Lamest detector ever? Pretty much the default anyway.
    if (S(input).contains('<')) {
      return 'html';
    }

    throw new Error('Could not determine input type. (string)');
  }

  // if input is not string, it must be a goldwasher array or cheerio object.
  // Property checks are only done on present values, so that missing input
  // ends in the error below rather than in a failure of the check itself.
  if (isPresent(input)) {
    if (R.has('parseHTML', input)) {
      return 'cheerio';
    }

    if (R.isArrayLike(input) &&
      isPresent(input[0]) &&
      R.has('timestamp', input[0])) {
      return 'array';
    }
  }

  throw new Error('Could not determine input type. (cheerio/array)');
};
508
/**
 * Entry point. Accepts HTML/XML as a string, a cheerio object or an array
 * of previously produced nuggets, plus an optional options object, and
 * returns the nuggets in the requested output format.
 * @param input string, cheerio object or array of nuggets
 * @param userOptions optional options; merged over the defaults
 * @returns {*} an array of nuggets for json output, otherwise the rendered
 * XML, Atom or RSS
 */
var goldwasher = function(input, userOptions) {
  var options = getOptions(userOptions);
  var batch;

  options.input = detectInputType(input);

  // step one: obtain the batch of nuggets and its meta data
  switch (options.input) {
    case 'array':
      batch = {
        meta: getNuggetsMeta(input),
        nuggets: input
      };
      break;
    case 'cheerio':
      batch = inputFromCheerio(input, options);
      break;
    default:
      // markup given as a string has to be loaded into cheerio first
      batch = inputFromCheerio(cheerio.load(input), options);
  }

  // step two: hand the batch to the requested output format
  switch (options.output) {
    case 'xml':
      return outputToXml(batch);
    case 'atom':
      return outputToFeed(batch, 'atom-1.0');
    case 'rss':
      return outputToFeed(batch, 'rss-2.0');
    default:
      // json: the plain list of nuggets
      return batch.nuggets;
  }
};
551
552module.exports = goldwasher;
\No newline at end of file