UNPKG

43 kBJavaScriptView Raw
1/**********************************************************************
2 node-feedparser - A robust RSS, Atom, RDF parser for node.
3 http://github.com/danmactough/node-feedparser
4 Copyright (c) 2011-2016 Dan MacTough and contributors
5 http://mact.me
6
7**********************************************************************/
8
9/**
10 * Module dependencies.
11 */
12var sax = require('sax')
13 , addressparser = require('addressparser')
14 , indexOfObject = require('array-indexofobject')
15 , util = require('util')
16 , TransformStream = require('readable-stream').Transform
17 , _ = require('../utils');
18
19/**
20 * FeedParser constructor.
21 *
22 * Exposes a duplex (transform) stream to parse a feed.
23 *
24 * Each article/post in the feed will have the following keys:
25 * - title {String}
26 * - description {String}
27 * - summary {String}
28 * - date {Date} (or null)
29 * - pubdate {Date} (or null)
30 * - link {String}
31 * - origlink {String}
32 * - author {String}
33 * - guid {String}
34 * - comments {String}
35 * - image {Object}
36 * - categories {Array}
37 * - source {Object}
38 * - enclosures {Array}
39 * - meta {Object}
40 * - Object.keys(meta):
41 * - #ns {Array} key,value pairs of each namespace declared for the feed
42 * - #type {String} one of 'atom', 'rss', 'rdf'
43 * - #version {String}
44 * - title {String}
45 * - description {String}
46 * - date {Date} (or null)
47 * - pubdate {Date} (or null)
48 * - link {String} i.e., to the website, not the feed
49 * - xmlurl {String} the canonical URL of the feed, as declared by the feed
50 * - author {String}
51 * - language {String}
52 * - image {Object}
53 * - favicon {String}
54 * - copyright {String}
55 * - generator {String}
56 * - categories {Array}
57 *
58 * @param {Object} options
59 * @api public
60 */
function FeedParser (options) {
  // Allow the constructor to be called without `new`.
  if (!(this instanceof FeedParser)) {
    return new FeedParser(options);
  }

  TransformStream.call(this);

  // The readable side carries parsed nodes (objects), not bytes.
  var readable = this._readableState;
  readable.objectMode = true;
  readable.highWaterMark = 16; // max. # of output nodes buffered

  this.init();

  // Copy the caller's options, then fill in whichever defaults are absent.
  // An explicitly supplied key is kept even when its value is undefined.
  var opts = this.options = _.assign({}, options);
  var fallbacks = {
    strict: false,
    normalize: true,
    addmeta: true,
    resume_saxerror: true
  };
  Object.keys(fallbacks).forEach(function (key) {
    if (!(key in opts)) opts[key] = fallbacks[key];
  });

  // NOTE: this assigns a property on the shared `sax` module object, so it
  // applies to every sax parser in the process, not only to this instance.
  // Pass MAX_BUFFER_LENGTH: Infinity to have unlimited buffers.
  sax.MAX_BUFFER_LENGTH = ('MAX_BUFFER_LENGTH' in opts)
    ? opts.MAX_BUFFER_LENGTH
    : 16 * 1024 * 1024; // 16M versus the 64K default

  // A caller-supplied feed URL seeds the xml:base stack.
  if (opts.feedurl) {
    this.xmlbase.unshift({ '#name': 'xml', '#': opts.feedurl });
  }

  // See https://github.com/isaacs/sax-js for more info
  var saxStream = this.stream = sax.createStream(opts.strict /* strict mode - no by default */, {lowercase: true, xmlns: true });
  var self = this;
  [
    ['error', 'handleSaxError'],
    ['processinginstruction', 'handleProcessingInstruction'],
    ['opentag', 'handleOpenTag'],
    ['closetag', 'handleCloseTag'],
    ['text', 'handleText'],
    ['cdata', 'handleText'],
    ['end', 'handleEnd']
  ].forEach(function (binding) {
    saxStream.on(binding[0], self[binding[1]].bind(self));
  });
}
util.inherits(FeedParser, TransformStream);
93
/*
 * Resets the per-parse state held on the instance.
 *
 * This only (re)creates the instance fields listed below; it does not
 * create the SAX stream, which the constructor sets up separately.
 */
FeedParser.prototype.init = function () {
  var freshState = {
    meta: {
      '#ns': [],
      '@': [],
      '#xml': {}
    },
    _emitted_meta: false,
    stack: [],
    xmlbase: [],
    in_xhtml: false,
    /* Where to store xhtml elements as associative
       array with keys: '#' (containing the text)
       and '#name' (containing the XML element name) */
    xhtml: {},
    errors: []
  };

  Object.keys(freshState).forEach(function (field) {
    this[field] = freshState[field];
  }, this);
};
114
/**
 * Handler for the SAX stream's 'end' event.
 *
 * If a feed type was recorded in this.meta, ends the readable side by
 * pushing null. Otherwise reports a 'Not a feed' error via handleError.
 */
FeedParser.prototype.handleEnd = function () {
  // Reaching the end without throwing is not enough; make sure a feed
  // type was actually recognised along the way.
  var sawFeed = Boolean(this.meta && this.meta['#type']);

  if (sawFeed) {
    this.push(null);
    return;
  }

  return this.handleError(new Error('Not a feed'));
};
124
/**
 * Handler for the SAX stream's 'error' event.
 *
 * Re-emits the error on this stream, then resumes the SAX parser when the
 * `resume_saxerror` option is truthy.
 *
 * @param {Error} e
 */
FeedParser.prototype.handleSaxError = function (e) {
  this.emit('error', e);

  if (!this.options.resume_saxerror) return;
  this.resumeSaxError();
};
131
/**
 * Clears the error recorded on the underlying SAX parser and resumes it.
 * Does nothing when the stream has no `_parser`.
 */
FeedParser.prototype.resumeSaxError = function () {
  if (!this.stream._parser) return;

  this.stream._parser.error = null;
  this.stream._parser.resume();
};
138
/**
 * Reports an error by emitting it as an 'error' event on this stream.
 *
 * @param {Error} e
 */
FeedParser.prototype.handleError = function (e) {
  var failure = e;
  this.emit('error', failure);
};
142
/**
 * Handler for the SAX stream's 'processinginstruction' event.
 *
 * Parses the XML declaration, which looks like:
 *   <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
 * and stores each pseudo-attribute on this.meta['#xml'].
 *
 * The pseudo-attributes are matched as name/value pairs rather than by
 * splitting on whitespace and '=', so that:
 *  - whitespace around '=' (legal in an XML declaration) is accepted,
 *  - a value containing '=' or spaces is kept whole,
 *  - an empty quoted value is stored as '' (not `false`),
 *  - an unquoted value (not well-formed, but seen in the wild) is stored
 *    as written.
 *
 * Processing instructions other than `xml` are ignored.
 *
 * @param {Object} node - has `name` and `body` (the text after the name)
 */
FeedParser.prototype.handleProcessingInstruction = function (node) {
  if (node.name !== 'xml') return;

  var declared = this.meta['#xml']
    // name = "value" | name = 'value' | name = value
    , pseudoAttr = /([^\s=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"']+))/g
    , body = node.body || ''
    , found
    ;

  while ((found = pseudoAttr.exec(body)) !== null) {
    if (found[2] !== undefined) declared[found[1]] = found[2];
    else if (found[3] !== undefined) declared[found[1]] = found[3];
    else declared[found[1]] = found[4];
  }
};
156
/**
 * Handler for the SAX stream's 'opentag' event.
 *
 * Builds the working node for the element and pushes it onto the front of
 * this.stack. In addition:
 *  - while inside an XHTML container, the start tag is re-serialized into
 *    the XHTML buffer (attribute values are escaped, because the parser
 *    hands them over already entity-decoded);
 *  - for the root element of a feed (rss, rdf:RDF, atom feed) the feed
 *    type, version and root attributes are recorded on this.meta.
 *
 * @param {Object} node - sax tag with name, prefix, local, uri, attributes
 */
FeedParser.prototype.handleOpenTag = function (node){
  // Escapes a value for use inside a double-quoted attribute.
  var escapeAttr = function (value) {
    return String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;');
  };

  var n = {};
  n['#name'] = node.name; // Avoid namespace collisions later...
  n['#prefix'] = node.prefix; // The current ns prefix
  n['#local'] = node.local; // The current element name, sans prefix
  n['#uri'] = node.uri; // The current ns uri
  n['@'] = {};
  n['#'] = '';

  if (Object.keys(node.attributes).length) {
    n['@'] = this.handleAttributes(node.attributes, n['#name']);
  }

  if (this.in_xhtml && this.xhtml['#name'] != n['#name']) { // We are in an xhtml node
    // This builds the opening tag, e.g., <div id="foo" class="bar">
    this.xhtml['#'] += '<'+n['#name'];
    Object.keys(n['@']).forEach(function(name){
      this.xhtml['#'] += ' '+ name +'="'+ escapeAttr(n['@'][name]) + '"';
    }, this);
    this.xhtml['#'] += '>';
  } else if ( this.stack.length === 0 &&
              (n['#name'] === 'rss' ||
              (n['#local'] === 'rdf' && _.nslookup([n['#uri']], 'rdf')) ||
              (n['#local'] === 'feed'&& _.nslookup([n['#uri']], 'atom')) ) ) {
    // Root element of a feed: keep every root attribute except `version`,
    // which is stored separately as meta['#version'].
    Object.keys(n['@']).forEach(function(name) {
      var o = {};
      if (name != 'version') {
        o[name] = n['@'][name];
        this.meta['@'].push(o);
      }
    }, this);
    switch(n['#local']) {
    case 'rss':
      this.meta['#type'] = 'rss';
      this.meta['#version'] = n['@']['version'];
      break;
    case 'rdf':
      this.meta['#type'] = 'rdf';
      this.meta['#version'] = n['@']['version'] || '1.0';
      break;
    case 'feed':
      this.meta['#type'] = 'atom';
      this.meta['#version'] = n['@']['version'] || '1.0';
      break;
    }
  }
  this.stack.unshift(n);
};
205
/**
 * Handler for the SAX stream's 'closetag' event.
 *
 * Pops the finished element off this.stack, works out its standardized
 * name, resolves link-like text against the current xml:base, closes any
 * XHTML capture, and then either
 *  - emits an article (item/entry) via this.push(),
 *  - extracts feed metadata (channel/feed) and emits 'meta', and/or
 *  - attaches the finished node to its parent (the new top of the stack).
 *
 * @param {String} el - the tag name as written in the document
 */
FeedParser.prototype.handleCloseTag = function (el){
  var node = { '#name': el
             , '#prefix': ''
             , '#local' : ''
             }
    , stdEl
    , item
    , baseurl
    ;
  var n = this.stack.shift();
  // Keep `el` as the original string: it is compared against the stored
  // xml:base element name further down. (Previously `el` was overwritten
  // with this array, so prefixed names never matched.)
  var nameParts = el.split(':');

  if (nameParts.length > 1 && nameParts[0] === n['#prefix']) {
    if (_.nslookup(n['#uri'], 'atom')) {
      node['#prefix'] = nameParts[0];
      node['#local'] = nameParts.slice(1).join(':');
      node['#type'] = 'atom';
    } else if (_.nslookup(n['#uri'], 'rdf')) {
      node['#prefix'] = nameParts[0];
      node['#local'] = nameParts.slice(1).join(':');
      node['#type'] = 'rdf';
    } else {
      node['#prefix'] = _.nsprefix(n['#uri']) || n['#prefix'];
      node['#local'] = nameParts.slice(1).join(':');
    }
  } else {
    node['#local'] = node['#name'];
    node['#type'] = _.nsprefix(n['#uri']) || n['#prefix'];
  }
  // The naming bookkeeping now lives on `node`; strip it from the data.
  delete n['#name'];
  delete n['#local'];
  delete n['#prefix'];
  delete n['#uri'];

  if (this.xmlbase && this.xmlbase.length) {
    baseurl = this.xmlbase[0]['#'];
  }

  var mayHaveResolvableUrl = (
    (
      (node['#local'] === 'logo' || node['#local'] === 'icon') && node['#type'] === 'atom'
    ) ||
    (
      node['#local'] === 'link' // include rss:link, even though it should _never_ be a relative URL
    )
  );
  if (baseurl && mayHaveResolvableUrl) {
    // Apply xml:base to these elements as they appear
    // rather than leaving it to the ultimate parser
    n['#'] = _.resolve(baseurl, n['#']);
  }

  // Leaving the element that declared the current xml:base: pop it.
  if (this.xmlbase.length && (el === this.xmlbase[0]['#name'])) {
    void this.xmlbase.shift();
  }

  if (this.in_xhtml) {
    if (node['#name'] == this.xhtml['#name']) { // The end of the XHTML

      // Add xhtml data to the container element
      n['#'] += this.xhtml['#'].trim();
      // Clear xhtml nodes from the tree
      for (var key in n) {
        if (key != '@' && key != '#') {
          delete n[key];
        }
      }
      this.xhtml = {};
      this.in_xhtml = false;
    } else { // Somewhere in the middle of the XHTML
      this.xhtml['#'] += '</' + node['#name'] + '>';
    }
  }

  if ('#' in n) {
    if (n['#'].match(/^\s*$/)) {
      // Delete text nodes with nothing but whitespace
      delete n['#'];
    } else {
      n['#'] = n['#'].trim();
      if (Object.keys(n).length === 1) {
        // If there is only one text node, hoist it
        n = n['#'];
      }
    }
  }

  if (node['#name'] === 'item' ||
      node['#name'] === 'entry' ||
      (node['#local'] === 'item' && (node['#prefix'] === '' || node['#type'] === 'rdf')) ||
      (node['#local'] == 'entry' && (node['#prefix'] === '' || node['#type'] === 'atom'))) { // We have an article!

    if (!this.meta.title) { // We haven't yet parsed all the metadata
      _.assign(this.meta, this.handleMeta(this.stack[0], this.meta['#type'], this.options));
      if (!this._emitted_meta) {
        this.emit('meta', this.meta);
        this._emitted_meta = true;
      }
    }
    if (!baseurl && this.xmlbase && this.xmlbase.length) { // handleMeta was able to infer a baseurl without xml:base or options.feedurl
      n = _.reresolve(n, this.xmlbase[0]['#']);
    }
    item = this.handleItem(n, this.meta['#type'], this.options);
    if (this.options.addmeta) {
      item.meta = this.meta;
    }
    if (this.meta.author && !item.author) item.author = this.meta.author;
    this.push(item);
  } else if (!this.meta.title && // We haven't yet parsed all the metadata
              (node['#name'] === 'channel' ||
               node['#name'] === 'feed' ||
               (node['#local'] === 'channel' && (node['#prefix'] === '' || node['#type'] === 'rdf')) ||
               (node['#local'] === 'feed' && (node['#prefix'] === '' || node['#type'] === 'atom')) ) ) {
    _.assign(this.meta, this.handleMeta(n, this.meta['#type'], this.options));
    if (!this._emitted_meta) {
      this.emit('meta', this.meta);
      this._emitted_meta = true;
    }
  }

  if (this.stack.length > 0) {
    // Pick the key under which this node is stored on its parent.
    if (node['#prefix'] && node['#local'] && !node['#type']) {
      stdEl = node['#prefix'] + ':' + node['#local'];
    } else if (node['#name'] && node['#type'] && node['#type'] !== this.meta['#type']) {
      stdEl = node['#name'];
    } else {
      stdEl = node['#local'] || node['#name'];
    }
    // First occurrence is stored directly; repeats are collected in an array.
    if (!Object.prototype.hasOwnProperty.call(this.stack[0], stdEl)) {
      this.stack[0][stdEl] = n;
    } else if (this.stack[0][stdEl] instanceof Array) {
      this.stack[0][stdEl].push(n);
    } else {
      this.stack[0][stdEl] = [this.stack[0][stdEl], n];
    }
  }
};
344
/**
 * Handler for the SAX stream's 'text' and 'cdata' events.
 *
 * Inside an XHTML container the text is appended to the XHTML buffer;
 * otherwise it is appended to the '#' text of the element on top of the
 * stack. Text arriving while the stack is empty is dropped.
 *
 * @param {String} text
 */
FeedParser.prototype.handleText = function (text) {
  if (this.in_xhtml) {
    this.xhtml['#'] += text;
    return;
  }

  if (!this.stack.length) return;

  var current = this.stack[0];
  if (current && '#' in current) {
    current['#'] += text;
  } else {
    current['#'] = text;
  }
};
358
/**
 * Flattens an element's sax attributes into a plain { name: value } map.
 *
 * Using the sax.js option { xmlns: true }, each entry of `attrs` is an
 * object (not a string) having the following properties:
 *   name   - e.g., xmlns:dc or href
 *   value
 *   prefix - the first part of the name of the attribute (before the colon)
 *   local  - the second part of the name of the attribute (after the colon)
 *   uri    - the uri of the namespace
 *
 * Side effects visible in this function:
 *   - namespace declarations are appended to this.meta['#ns'];
 *   - href/src/uri values are resolved against the current xml:base (this
 *     rewrites `value` on the sax attribute object itself);
 *   - an xml:base attribute pushes a new entry onto this.xmlbase;
 *   - type="xhtml" switches the parser into XHTML capture mode.
 *
 * @param {Object} attrs - the sax attributes of the element
 * @param {String} el - the element's name
 * @return {Object} attribute values keyed by (possibly prefixed) name
 */
FeedParser.prototype.handleAttributes = function handleAttributes (attrs, el) {
  var simplifiedAttributes = {};

  // The base in effect when the element opened. It is deliberately read
  // once: an xml:base found on this element does not affect its siblings.
  var basepath = (this.xmlbase && this.xmlbase.length) ? this.xmlbase[0]['#'] : '';

  Object.keys(attrs).forEach(function (key) {
    var attr = attrs[key];
    var prefix = '';
    var declaration;

    if (attr.prefix === 'xmlns') {
      declaration = {};
      declaration[attr.name] = attr.value;
      this.meta['#ns'].push(declaration);
    }

    // If the feed is using a non-default prefix, we'll use it, too
    // But we force the use of the 'xml' prefix
    if ((attr.uri && attr.prefix && !_.nslookup(attr.uri, attr.prefix)) || _.nslookup(attr.uri, 'xml')) {
      prefix = ( _.nsprefix(attr.uri) || attr.prefix ) + ( attr.local ? ':' : '' );
    }

    var looksLikeUrl = (attr.local == 'href' || attr.local == 'src' || attr.local == 'uri');

    if (basepath && looksLikeUrl) {
      // Apply xml:base to these elements as they appear
      // rather than leaving it to the ultimate parser
      attr.value = _.resolve(basepath, attr.value);
    }
    else if (attr.local === 'base' && _.nslookup(attr.uri, 'xml')) {
      // Keep track of the xml:base for the current node
      if (basepath) {
        attr.value = _.resolve(basepath, attr.value);
      }
      this.xmlbase.unshift({ '#name': el, '#': attr.value});
    }
    else if (attr.name === 'type' && attr.value === 'xhtml') {
      this.in_xhtml = true;
      this.xhtml = {'#name': el, '#': ''};
    }

    simplifiedAttributes[prefix + attr.local] = attr.value ? attr.value.trim() : '';
  }, this);

  return simplifiedAttributes;
};
411
/**
 * Builds the feed-level metadata from a parsed channel/feed node.
 *
 * Every child of `node` whose key does not start with '#' is copied to the
 * result under its namespaced name (e.g. 'rss:title', 'itunes:author').
 * When normalization is on (the default when `options` is omitted), the
 * well-known properties (title, description, date, pubdate, link, xmlurl,
 * author, language, favicon, copyright, generator, cloud, image,
 * categories) are also filled in from whichever feed dialect supplies them.
 *
 * Side effects: when an absolute feed/site URL is discovered and no base
 * URL is known yet, it is pushed onto this.xmlbase and this.stack[0] is
 * re-resolved against it.
 *
 * @param {Object} node - the parsed channel/feed element
 * @param {String} type - 'atom', 'rss' or 'rdf'
 * @param {Object} [options] - only `normalize` is read from this argument
 * @return {Object} the metadata ({} when `type` or `node` is missing)
 */
FeedParser.prototype.handleMeta = function handleMeta (node, type, options) {
  if (!type || !node) return {};

  var meta = {}
    , normalize = !options || (options && options.normalize)
    ;

  if (normalize) {
    ['title','description','date', 'pubdate', 'pubDate','link', 'xmlurl', 'xmlUrl','author','language','favicon','copyright','generator'].forEach(function (property){
      meta[property] = null;
    });
    meta.cloud = {};
    meta.image = {};
    meta.categories = [];
  }

  Object.keys(node).forEach(function(name){
    var el = node[name];

    if (normalize) {
      switch(name){
      case('title'):
        meta.title = _.get(el);
        break;
      case('description'):
      case('subtitle'):
        meta.description = _.get(el);
        break;
      case('pubdate'):
      case('lastbuilddate'):
      case('published'):
      case('modified'):
      case('updated'):
      case('dc:date'):
        // NOTE: an unparseable date string produces an Invalid Date object,
        // which is truthy and is therefore stored as-is (not as null).
        var date = _.get(el) ? new Date(_.get(el)) : null;
        if (!date) break;
        if (meta.pubdate === null || name == 'pubdate' || name == 'published')
          meta.pubdate = meta.pubDate = date;
        if (meta.date === null || name == 'lastbuilddate' || name == 'modified' || name == 'updated')
          meta.date = date;
        break;
      case('link'):
      case('atom:link'):
      case('atom10:link'):
        if (Array.isArray(el)) {
          el.forEach(function (link){
            if (link['@']['href']) { // Atom
              if (_.get(link['@'], 'rel')) {
                if (link['@']['rel'] == 'alternate') {
                  if (!meta.link) meta.link = link['@']['href'];
                }
                else if (link['@']['rel'] == 'self') {
                  meta.xmlurl = meta.xmlUrl = link['@']['href'];
                  if (_.isAbsoluteUrl(meta.xmlurl) && this.xmlbase && this.xmlbase.length === 0) {
                    this.xmlbase.unshift({ '#name': 'xml', '#': meta.xmlurl });
                    this.stack[0] = _.reresolve(this.stack[0], meta.xmlurl);
                  }
                  else if (this.xmlbase && this.xmlbase.length > 0) {
                    meta.xmlurl = meta.xmlUrl = _.resolve(_.get(this.xmlbase[0], '#'), meta.xmlurl);
                  }
                }
                else if (link['@']['rel'] == 'hub' && !(meta.cloud.href || meta.cloud.domain)) {
                  meta.cloud.type = 'hub';
                  meta.cloud.href = link['@']['href'];
                }
              } else {
                if (!meta.link) meta.link = link['@']['href'];
              }
            } else if (Object.keys(link['@']).length === 0) { // RSS
              meta.link = _.get(link);
            }
            if (_.isAbsoluteUrl(meta.link) && this.xmlbase && this.xmlbase.length === 0) {
              this.xmlbase.unshift({ '#name': 'xml', '#': meta.link});
              this.stack[0] = _.reresolve(this.stack[0], meta.link);
            }
            else if (this.xmlbase && this.xmlbase.length > 0) {
              meta.link = _.resolve(_.get(this.xmlbase[0], '#'), meta.link);
            }
          }, this);
        } else {
          if (el['@']['href']) { // Atom
            if (_.get(el['@'], 'rel')) {
              if (el['@']['rel'] == 'alternate') {
                if (!meta.link) meta.link = el['@']['href'];
              }
              else if (el['@']['rel'] == 'self') {
                meta.xmlurl = meta.xmlUrl = el['@']['href'];
                if (_.isAbsoluteUrl(meta.xmlurl) && this.xmlbase && this.xmlbase.length === 0) {
                  this.xmlbase.unshift({ '#name': 'xml', '#': meta.xmlurl});
                  this.stack[0] = _.reresolve(this.stack[0], meta.xmlurl);
                }
                else if (this.xmlbase && this.xmlbase.length > 0) {
                  meta.xmlurl = meta.xmlUrl = _.resolve(_.get(this.xmlbase[0], '#'), meta.xmlurl);
                }
              }
              else if (el['@']['rel'] == 'hub' && !(meta.cloud.href || meta.cloud.domain)) {
                meta.cloud.type = 'hub';
                meta.cloud.href = el['@']['href'];
              }
            } else {
              meta.link = el['@']['href'];
            }
          } else if (Object.keys(el['@']).length === 0) { // RSS
            if (!meta.link) meta.link = _.get(el);
          }
          if (_.isAbsoluteUrl(meta.link) && this.xmlbase && this.xmlbase.length === 0) {
            this.xmlbase.unshift({ '#name': 'xml', '#': meta.link});
            this.stack[0] = _.reresolve(this.stack[0], meta.link);
          }
          else if (this.xmlbase && this.xmlbase.length > 0) {
            meta.link = _.resolve(_.get(this.xmlbase[0], '#'), meta.link);
          }
        }
        break;
      case('managingeditor'):
      case('webmaster'):
      case('author'):
        var author;
        if (name == 'author') {
          meta.author = _.get(el.name) || _.get(el.email) || _.get(el.uri);
        }
        else if (_.get(el)) {
          author = addressparser(_.get(el))[0];
          if (author) {
            el['name'] = author.name;
            el['email'] = author.address;
          }
          if (meta.author === null || name == 'managingeditor') {
            // addressparser may find no address at all; fall back to the
            // raw text instead of dereferencing undefined.
            meta.author = (author && (author.name || author.address)) || _.get(el);
          }
        }
        break;
      case('cloud'):
        // I can't believe someone actually would put two cloud elements in their channel
        // but it happened
        // Nevertheless, there can be only one
        // This will ensure that rssCloud "wins" here.
        // If pubsubhubbub is also declared, it's still available in the link elements
        meta.cloud = {};
        if (Array.isArray(el)) {
          Object.keys(el[0]['@']).forEach(function (attr) {
            if (_.has(el[0]['@'], attr)) {
              meta.cloud[attr] = el[0]['@'][attr];
            }
          });
        }
        else {
          Object.keys(el['@']).forEach(function (attr) {
            if (_.has(el['@'], attr)) {
              meta.cloud[attr] = el['@'][attr];
            }
          });
        }
        meta.cloud.type = 'rsscloud';
        break;
      case('language'):
        meta.language = _.get(el);
        break;
      case('image'):
      case('logo'):
        if (el.url)
          meta.image.url = _.get(el.url);
        if (el.title)
          meta.image.title = _.get(el.title);
        if (!meta.image.url && _.get(el))
          meta.image.url = _.get(el);
        break;
      case('icon'):
        meta.favicon = _.get(el);
        break;
      case('copyright'):
      case('rights'):
      case('dc:rights'):
        meta.copyright = _.get(el);
        break;
      case('generator'):
        meta.generator = _.get(el);
        if (_.get(el['@'], 'version'))
          meta.generator += (meta.generator ? ' ' : '') + 'v' + el['@'].version;
        if (_.get(el['@'], 'uri'))
          meta.generator += meta.generator ? ' (' + el['@'].uri + ')' : el['@'].uri;
        break;
      case('category'):
      case('dc:subject'):
      case('itunes:category'):
      case('media:category'):
        /* We handle all the kinds of categories within the switch loop because meta.categories
         * is an array, unlike the other properties, and therefore can handle multiple values
         */
        var _category = ''
          , _subcategory = ''
          , _categories = []
          ;
        if (Array.isArray(el)) {
          el.forEach(function (category){
            var _categoryValue;
            if ('category' == name && 'atom' == type) {
              if (category['@'] && (_categoryValue = _.safeTrim(_.get(category['@'], 'term')))) {
                meta.categories.push(_categoryValue);
              }
            }
            else if ('category' == name && 'rss' == type){
              if ((_categoryValue = _.safeTrim(_.get(category)))) {
                meta.categories.push(_categoryValue);
              }
            }
            else if ('dc:subject' == name && (_categoryValue = _.safeTrim(_.get(category)))) {
              _categories = _categoryValue.split(' ').map(function (cat){ return cat.trim(); });
              if (_categories.length) {
                meta.categories = meta.categories.concat(_categories);
              }
            }
            else if ('itunes:category' == name) {
              if (category['@'] && _.safeTrim(_.get(category['@'], 'text'))) _category = _.safeTrim(_.get(category['@'], 'text'));
              if (category[name]) {
                if (Array.isArray(category[name])) {
                  category[name].forEach(function (subcategory){
                    var _subcategoryValue;
                    if (subcategory['@'] && (_subcategoryValue = _.safeTrim(_.get(subcategory['@'], 'text')))) {
                      meta.categories.push(_category + '/' + _subcategoryValue);
                    }
                  });
                }
                else if (category[name]['@'] && (_categoryValue = _.safeTrim(_.get(category[name]['@'], 'text')))) {
                  meta.categories.push(_category + '/' + _categoryValue);
                }
              }
              else if (_category) {
                meta.categories.push(_category);
              }
            }
            else if ('media:category' == name && (_categoryValue = _.safeTrim(_.get(category)))) {
              meta.categories.push(_categoryValue);
            }
          });
        } else {
          if ('category' == name && 'atom' == type) {
            if ((_category = _.safeTrim(_.get(el['@'], 'term')))) {
              meta.categories.push(_category);
            }
          }
          else if ('category' == name && 'rss' == type) {
            if ((_category = _.safeTrim(_.get(el)))) {
              meta.categories.push(_category);
            }
          }
          else if ('dc:subject' == name && (_category = _.safeTrim(_.get(el)))) {
            _categories = _category.split(' ').map(function (cat){ return cat.trim(); });
            if (_categories.length) {
              meta.categories = meta.categories.concat(_categories);
            }
          }
          else if ('itunes:category' == name) {
            if (el['@'] && _.safeTrim(_.get(el['@'], 'text'))) _category = _.safeTrim(_.get(el['@'], 'text'));
            if (el[name]) {
              if (Array.isArray(el[name])) {
                el[name].forEach(function (subcategory){
                  var _subcategoryValue;
                  if (subcategory['@'] && (_subcategoryValue = _.safeTrim(_.get(subcategory['@'], 'text')))) {
                    meta.categories.push(_category + '/' + _subcategoryValue);
                  }
                });
              }
              // Use a separate variable for the subcategory so the parent
              // category is not overwritten before it is used.
              else if (el[name]['@'] && (_subcategory = _.safeTrim(_.get(el[name]['@'], 'text')))) {
                meta.categories.push(_category + '/' + _subcategory);
              }
            }
            else if (_category) {
              meta.categories.push(_category);
            }
          }
          else if ('media:category' == name && (_category = _.safeTrim(_.get(el)))) {
            // Push the trimmed value, as the array branch does.
            meta.categories.push(_category);
          }
        }
        break;
      } // switch end
    }
    // Fill with all native other namespaced properties
    if (name.indexOf('#') !== 0) {
      if (~name.indexOf(':')) meta[name] = el;
      else meta[type + ':' + name] = el;
    }
  }, this); // forEach end

  if (normalize) {
    // Fallbacks for properties the core elements did not supply.
    if (!meta.description) {
      if (node['itunes:summary']) meta.description = _.get(node['itunes:summary']);
      else if (node['tagline']) meta.description = _.get(node['tagline']);
    }
    if (!meta.author) {
      if (node['itunes:author']) meta.author = _.get(node['itunes:author']);
      else if (node['itunes:owner'] && node['itunes:owner']['itunes:name']) meta.author = _.get(node['itunes:owner']['itunes:name']);
      else if (node['dc:creator']) meta.author = _.get(node['dc:creator']);
      else if (node['dc:publisher']) meta.author = _.get(node['dc:publisher']);
    }
    if (!meta.language) {
      if (node['@'] && node['@']['xml:lang']) meta.language = _.get(node['@'], 'xml:lang');
      else if (node['dc:language']) meta.language = _.get(node['dc:language']);
    }
    if (!meta.image.url) {
      if (node['itunes:image']) meta.image.url = _.get(node['itunes:image']['@'], 'href');
      else if (node['media:thumbnail']) {
        if (Array.isArray(node['media:thumbnail'])) {
          node['media:thumbnail'] = node['media:thumbnail'][0];
        }
        meta.image.url = _.get(node['media:thumbnail']['@'], 'url');
      }
    }
    if (!meta.copyright) {
      if (node['media:copyright']) meta.copyright = _.get(node['media:copyright']);
      else if (node['dc:rights']) meta.copyright = _.get(node['dc:rights']);
      else if (node['creativecommons:license']) meta.copyright = _.get(node['creativecommons:license']);
      else if (node['cc:license']) {
        if (Array.isArray(node['cc:license']) && node['cc:license'][0]['@'] && node['cc:license'][0]['@']['rdf:resource']) {
          meta.copyright = _.get(node['cc:license'][0]['@'], 'rdf:resource');
        } else if (node['cc:license']['@'] && node['cc:license']['@']['rdf:resource']) {
          meta.copyright = _.get(node['cc:license']['@'], 'rdf:resource');
        }
      }
    }
    if (!meta.generator) {
      if (node['admin:generatoragent']) {
        if (Array.isArray(node['admin:generatoragent']) && node['admin:generatoragent'][0]['@'] && node['admin:generatoragent'][0]['@']['rdf:resource']) {
          meta.generator = _.get(node['admin:generatoragent'][0]['@'], 'rdf:resource');
        } else if (node['admin:generatoragent']['@'] && node['admin:generatoragent']['@']['rdf:resource']) {
          meta.generator = _.get(node['admin:generatoragent']['@'], 'rdf:resource');
        }
      }
    }
    if (meta.categories.length) {
      meta.categories = _.uniq(meta.categories);
    }
    if (!meta.link) {
      // An http(s) atom:id can stand in for a missing site link.
      if (meta['atom:id'] && _.get(meta['atom:id']) && /^https?:/.test(_.get(meta['atom:id']))) {
        meta.link = _.get(meta['atom:id']);
      }
    }
    // Note: this reads the instance options, not the `options` argument.
    if (!meta.xmlurl && this.options.feedurl) {
      meta.xmlurl = meta.xmlUrl = this.options.feedurl;
    }
    meta.title = meta.title && _.stripHtml(meta.title);
    meta.description = meta.description && _.stripHtml(meta.description);
  }

  return meta;
};
758
759FeedParser.prototype.handleItem = function handleItem (node, type, options){
760 if (!type || !node) return {};
761
762 var item = {}
763 , normalize = !options || (options && options.normalize)
764 ;
765
766 if (normalize) {
767 ['title','description','summary','date','pubdate','pubDate','link','guid','author','comments', 'origlink'].forEach(function (property){
768 item[property] = null;
769 });
770 item.image = {};
771 item.source = {};
772 item.categories = [];
773 item.enclosures = [];
774 }
775
776 Object.keys(node).forEach(function(name){
777 var el = node[name]
778 , attrs = _.get(el, '@')
779 , enclosure;
780 if (normalize) {
781 switch(name){
782 case('title'):
783 item.title = _.get(el);
784 break;
785 case('description'):
786 case('summary'):
787 item.summary = _.get(el);
788 if (!item.description) item.description = _.get(el);
789 break;
790 case('content'):
791 case('content:encoded'):
792 item.description = _.get(el);
793 break;
794 case('pubdate'):
795 case('published'):
796 case('issued'):
797 case('modified'):
798 case('updated'):
799 case('dc:date'):
800 var date = _.get(el) ? new Date(_.get(el)) : null;
801 if (!date) break;
802 if (item.pubdate === null || name == 'pubdate' || name == 'published' || name == 'issued')
803 item.pubdate = item.pubDate = date;
804 if (item.date === null || name == 'modified' || name == 'updated')
805 item.date = date;
806 break;
807 case('link'):
808 if (Array.isArray(el)) {
809 el.forEach(function (link){
810 if (link['@']['href']) { // Atom
811 if (_.get(link['@'], 'rel')) {
812 if (link['@']['rel'] == 'canonical') item.origlink = link['@']['href'];
813 if (link['@']['rel'] == 'alternate') item.link = link['@']['href'];
814 if (link['@']['rel'] == 'self' && !item.link) item.link = link['@']['href'];
815 if (link['@']['rel'] == 'replies') item.comments = link['@']['href'];
816 if (link['@']['rel'] == 'enclosure') {
817 enclosure = {};
818 enclosure.url = link['@']['href'];
819 enclosure.type = _.get(link['@'], 'type');
820 enclosure.length = _.get(link['@'], 'length');
821 if (indexOfObject(item.enclosures, enclosure, ['url', 'type']) === -1) {
822 item.enclosures.push(enclosure);
823 }
824 }
825 } else {
826 item.link = link['@']['href'];
827 }
828 } else if (Object.keys(link['@']).length === 0) { // RSS
829 if (!item.link) item.link = _.get(link);
830 }
831 });
832 } else {
833 if (el['@']['href']) { // Atom
834 if (_.get(el['@'], 'rel')) {
835 if (el['@']['rel'] == 'canonical') item.origlink = el['@']['href'];
836 if (el['@']['rel'] == 'alternate') item.link = el['@']['href'];
837 if (el['@']['rel'] == 'self' && !item.link) item.link = el['@']['href'];
838 if (el['@']['rel'] == 'replies') item.comments = el['@']['href'];
839 if (el['@']['rel'] == 'enclosure') {
840 enclosure = {};
841 enclosure.url = el['@']['href'];
842 enclosure.type = _.get(el['@'], 'type');
843 enclosure.length = _.get(el['@'], 'length');
844 if (indexOfObject(item.enclosures, enclosure, ['url', 'type']) === -1) {
845 item.enclosures.push(enclosure);
846 }
847 }
848 } else {
849 item.link = el['@']['href'];
850 }
851 } else if (Object.keys(el['@']).length === 0) { // RSS
852 if (!item.link) item.link = _.get(el);
853 }
854 }
855 if (!item.guid) item.guid = item.link;
856 break;
857 case('guid'):
858 case('id'):
859 item.guid = _.get(el);
860 // http://cyber.law.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
861 // If the guid element has an attribute named "isPermaLink" with a value
862 // of true, the reader may assume that it is a permalink to the item,
863 // that is, a url that can be opened in a Web browser, that points to
864 // the full item described by the <item> element.
865 // isPermaLink is optional, its default value is true. If its value is
866 // false, the guid may not be assumed to be a url, or a url to anything
867 // in particular.
868 if (item.guid && type == 'rss' && name == 'guid' && !(attrs.ispermalink && attrs.ispermalink.match(/false/i))) {
869 item.permalink = item.guid;
870 }
871 break;
872 case('author'):
873 var author = {};
874 if (_.get(el)) { // RSS
875 author = addressparser(_.get(el))[0];
876 if (author) {
877 el['name'] = author.name;
878 el['email'] = author.address;
879 item.author = author.name || author.address;
880 }
881 // addressparser failed
882 else {
883 item.author = _.get(el);
884 }
885 } else {
886 item.author = _.get(el.name) || _.get(el.email) || _.get(el.uri);
887 }
888 break;
889 case('dc:creator'):
890 item.author = _.get(el);
891 break;
892 case('comments'):
893 item.comments = _.get(el);
894 break;
895 case('source'):
896 if ('rss' == type) {
897 item.source['title'] = _.get(el);
898 item.source['url'] = _.get(el['@'], 'url');
899 } else if ('atom' == type) {
900 if (el.title && _.get(el.title))
901 item.source['title'] = _.get(el.title);
902 if (el.link && _.get(el.link['@'], 'href'))
903 item.source['url'] = _.get(el.link['@'], 'href');
904 }
905 if (item.source['url'] && !this.meta.xmlurl) {
906 this.meta.xmlurl = this.meta.xmlUrl = item.source['url'];
907 if (_.isAbsoluteUrl(item.source['url']) && this.xmlbase && this.xmlbase.length === 0) {
908 this.xmlbase.unshift({ '#name': 'xml', '#': item.source['url']});
909 this.stack[0] = _.reresolve(this.stack[0], item.source['url']);
910 }
911 else if (this.xmlbase && this.xmlbase.length > 0) {
912 this.meta.xmlurl = this.meta.xmlUrl = item.source['url'] = _.resolve(_.get(this.xmlbase[0], '#'), item.source['url']);
913 }
914 }
915 break;
916 case('enclosure'):
917 if (Array.isArray(el)) {
918 el.forEach(function (enc){
919 enclosure = {};
920 enclosure.url = _.get(enc['@'], 'url');
921 enclosure.type = _.get(enc['@'], 'type');
922 enclosure.length = _.get(enc['@'], 'length');
923 if (~indexOfObject(item.enclosures, enclosure, ['url', 'type'])) {
924 item.enclosures.splice(indexOfObject(item.enclosures, enclosure, ['url', 'type']), 1, enclosure);
925 } else {
926 item.enclosures.push(enclosure);
927 }
928 });
929 } else {
930 enclosure = {};
931 enclosure.url = _.get(el['@'], 'url');
932 enclosure.type = _.get(el['@'], 'type');
933 enclosure.length = _.get(el['@'], 'length');
934 if (~indexOfObject(item.enclosures, enclosure, ['url', 'type'])) {
935 item.enclosures.splice(indexOfObject(item.enclosures, enclosure, ['url', 'type']), 1, enclosure);
936 } else {
937 item.enclosures.push(enclosure);
938 }
939 }
940 break;
941 case('media:content'):
942 var optionalAttributes = ['bitrate', 'framerate', 'samplingrate', 'duration', 'height', 'width'];
943 if (Array.isArray(el)) {
944 el.forEach(function (enc){
945 enclosure = {};
946 enclosure.url = _.get(enc['@'], 'url');
947 enclosure.type = _.get(enc['@'], 'type') || _.get(enc['@'], 'medium');
948 enclosure.length = _.get(enc['@'], 'filesize');
949 var index = indexOfObject(item.enclosures, enclosure, ['url', 'type']);
950 if (index !== -1) {
951 enclosure = item.enclosures[index];
952 }
953 optionalAttributes.forEach(function (attribute) {
954 if (!enclosure[attribute] && _.get(enc['@'], attribute)) {
955 enclosure[attribute] = _.get(enc['@'], attribute);
956 }
957 });
958 if (index === -1) {
959 item.enclosures.push(enclosure);
960 }
961 });
962 } else {
963 enclosure = {};
964 enclosure.url = _.get(el['@'], 'url');
965 enclosure.type = _.get(el['@'], 'type') || _.get(el['@'], 'medium');
966 enclosure.length = _.get(el['@'], 'filesize');
967 var index = indexOfObject(item.enclosures, enclosure, ['url', 'type']);
968 if (index !== -1) {
969 enclosure = item.enclosures[index];
970 }
971 optionalAttributes.forEach(function (attribute) {
972 if (!enclosure[attribute] && _.get(el['@'], attribute)) {
973 enclosure[attribute] = _.get(el['@'], attribute);
974 }
975 });
976 if (index === -1) {
977 item.enclosures.push(enclosure);
978 }
979 }
980 break;
981 case('enc:enclosure'): // Can't find this in use for an example to debug. Only example found does not comply with the spec -- can't code THAT!
982 break;
983 case('category'):
984 case('dc:subject'):
985 case('itunes:category'):
986 case('media:category'):
987 /* We handle all the kinds of categories within the switch loop because item.categories
988 * is an array, unlike the other properties, and therefore can handle multiple values
989 */
990 var _category = ''
991 , _categories = []
992 ;
993 if (Array.isArray(el)) {
994 el.forEach(function (category){
995 if ('category' == name && 'atom' == type) {
996 if (category['@'] && _.get(category['@'], 'term')) item.categories.push(_.get(category['@'], 'term'));
997 } else if ('category' == name && _.get(category) && 'rss' == type) {
998 item.categories.push(_.get(category).trim());
999 } else if ('dc:subject' == name && _.get(category)) {
1000 _categories = _.get(category).split(' ').map(function (cat){ return cat.trim(); });
1001 if (_categories.length) item.categories = item.categories.concat(_categories);
1002 } else if ('itunes:category' == name) {
1003 if (category['@'] && _.get(category['@'], 'text')) _category = _.get(category['@'], 'text');
1004 if (category[name]) {
1005 if (Array.isArray(category[name])) {
1006 category[name].forEach(function (subcategory){
1007 if (subcategory['@'] && _.get(subcategory['@'], 'text')) item.categories.push(_category + '/' + _.get(subcategory['@'], 'text'));
1008 });
1009 } else {
1010 if (category[name]['@'] && _.get(category[name]['@'], 'text'))
1011 item.categories.push(_category + '/' + _.get(category[name]['@'], 'text'));
1012 }
1013 } else {
1014 item.categories.push(_category);
1015 }
1016 } else if ('media:category' == name) {
1017 item.categories.push(_.get(category));
1018 }
1019 });
1020 } else {
1021 if ('category' == name && 'atom' == type) {
1022 if (_.get(el['@'], 'term')) item.categories.push(_.get(el['@'], 'term'));
1023 } else if ('category' == name && _.get(el) && 'rss' == type) {
1024 item.categories.push(_.get(el).trim());
1025 } else if ('dc:subject' == name && _.get(el)) {
1026 _categories = _.get(el).split(' ').map(function (cat){ return cat.trim(); });
1027 if (_categories.length) item.categories = item.categories.concat(_categories);
1028 } else if ('itunes:category' == name) {
1029 if (el['@'] && _.get(el['@'], 'text')) _category = _.get(el['@'], 'text');
1030 if (el[name]) {
1031 if (Array.isArray(el[name])) {
1032 el[name].forEach(function (subcategory){
1033 if (subcategory['@'] && _.get(subcategory['@'], 'text')) item.categories.push(_category + '/' + _.get(subcategory['@'], 'text'));
1034 });
1035 } else {
1036 if (el[name]['@'] && _.get(el[name]['@'], 'text'))
1037 item.categories.push(_category + '/' + _.get(el[name]['@'], 'text'));
1038 }
1039 } else {
1040 item.categories.push(_category);
1041 }
1042 } else if ('media:category' == name) {
1043 item.categories.push(_.get(el));
1044 }
1045 }
1046 break;
1047 case('feedburner:origlink'):
1048 case('pheedo:origlink'):
1049 if (!item.origlink) {
1050 item.origlink = _.get(el);
1051 }
1052 break;
1053 } // switch end
1054 }
1055 // Fill with all native other namespaced properties
1056 if (name.indexOf('#') !== 0) {
1057 if (~name.indexOf(':')) item[name] = el;
1058 else item[type + ':' + name] = el;
1059 }
1060 }, this); // forEach end
1061
1062 if (normalize) {
1063 if (!item.description) {
1064 if (node['itunes:summary']) item.description = _.get(node['itunes:summary']);
1065 }
1066 if (!item.author) {
1067 if (node['itunes:author']) item.author = _.get(node['itunes:author']);
1068 else if (node['itunes:owner'] && node['itunes:owner']['itunes:name']) item.author = _.get(node['itunes:owner']['itunes:name']);
1069 else if (node['dc:publisher']) item.author = _.get(node['dc:publisher']);
1070 }
1071 if (!item.image.url) {
1072 if (node['itunes:image']) item.image.url = _.get(node['itunes:image']['@'], 'href');
1073 else if (node['media:thumbnail']) {
1074 if (Array.isArray(node['media:thumbnail'])) {
1075 item.image.url = _.get(node['media:thumbnail'][0]['@'], 'url');
1076 } else {
1077 item.image.url = _.get(node['media:thumbnail']['@'], 'url');
1078 }
1079 }
1080 else if (node['media:content'] && node['media:content']['media:thumbnail']) item.image.url = _.get(node['media:content']['media:thumbnail']['@'], 'url');
1081 else if (node['media:group'] && node['media:group']['media:thumbnail']) item.image.url = _.get(node['media:group']['media:thumbnail']['@'], 'url');
1082 else if (node['media:group'] && node['media:group']['media:content'] && node['media:group']['media:content']['media:thumbnail']) item.image.url = _.get(node['media:group']['media:content']['media:thumbnail']['@'], 'url');
1083 else if (node['g:image_link']) item.image.url = _.get(node['g:image_link']);
1084 }
1085 if (item.categories.length) {
1086 item.categories = _.uniq(item.categories);
1087 }
1088 if (!item.link) {
1089 if (item.guid && /^https?:/.test(item.guid)) {
1090 item.link = item.guid;
1091 }
1092 }
1093 item.title = item.title && _.stripHtml(item.title);
1094 }
1095 return item;
1096};
1097
1098// Naive Stream API
/**
 * Transform hook: forwards one incoming chunk to `this.stream`.
 *
 * Only the `this.stream.write` call is guarded. The success callback is
 * invoked after the try/catch so that an exception thrown while `done` runs
 * is not mistaken for a parse failure -- which would call `done` a second
 * time and end the readable side.
 *
 * @param {*} data chunk handed unchanged to `this.stream.write`
 * @param {String} encoding not used here
 * @param {Function} done called with no arguments on success, or with the
 *   value thrown by `this.stream.write`
 * @api private
 */
FeedParser.prototype._transform = function (data, encoding, done) {
  try {
    this.stream.write(data);
  }
  catch (e) {
    done(e);
    this.push(null); // Manually trigger an end, since we can't reliably do any more parsing
    return;
  }
  done();
};
1109
/**
 * Flush hook: ends `this.stream` once all input has been written.
 *
 * Only the `this.stream.end` call is guarded. The success callback is
 * invoked after the try/catch so that an exception thrown while `done` runs
 * does not lead to `done` being called a second time.
 *
 * @param {Function} done called with no arguments on success, or with the
 *   value thrown by `this.stream.end`
 * @api private
 */
FeedParser.prototype._flush = function (done) {
  try {
    this.stream.end();
  }
  catch (e) {
    done(e);
    return;
  }
  done();
};
1119
1120exports = module.exports = FeedParser;