UNPKG

17.6 kBJavaScriptView Raw
1var moment = require('moment');
2var async = require('async');
3var _ = require('@sailshq/lodash');
4var fs = require('fs');
5var url = require('url');
6
7var defaultLocale = 'default';
8
9module.exports = {
10
11 // Cache sitemaps for 1 hour by default. Depending on pagerank
12 // Google may look at your sitemap somewhere between daily and
13 // monthly, so don't get your hopes up too far about changing this
14
15 cacheLifetime: 60 * 60,
16
17 piecesPerBatch: 100,
18
19 moogBundle: {
20 modules: [ 'apostrophe-site-map-custom-pages', 'apostrophe-site-map-pieces' ],
21 directory: 'lib/modules'
22 },
23
24 afterConstruct: function(self) {
25 self.apos.tasks.add(self.__meta.name, 'map', self.mapTask);
26 self.apos.tasks.add(self.__meta.name, 'clear', self.clearTask);
27 self.addRoutes();
28 self.enableCache();
29 },
30
31 construct: function(self, options) {
32
// Settings copied from the module options. `baseUrl` falls back to
// the site-wide `self.apos.baseUrl` when the module has none of its own.
self.baseUrl = options.baseUrl || self.apos.baseUrl;
self.piecesPerBatch = options.piecesPerBatch;
self.cacheLifetime = options.cacheLifetime;

// Output goes to the cache unless the `map` task turns this off
self.caching = true;
40
// Task handler: empties the sitemap cache so the sitemaps are
// regenerated on the next request. `apos` and `argv` are unused.
self.clearTask = function(apos, argv, callback) {
  var cache = self.cache;
  return cache.clear(callback);
};
46
// Task handler: generates the sitemap. Output is written to the cache
// only when the `update-cache` argument is set; otherwise it goes to
// files. Reports an error through the callback if no `baseUrl` is known.
self.mapTask = function(apos, argv, callback) {
  self.caching = !!argv['update-cache'];

  if (self.baseUrl) {
    return self.map(callback);
  }

  var message =
    'You must specify the top-level baseUrl option when configuring Apostrophe\n' +
    'to use this task. Example: baseUrl: "http://mycompany.com"\n\n' +
    'Note there is NO TRAILING SLASH.\n\n' +
    'Usually you will only do this in data/local.js, on production.';
  return callback(new Error(message));
};
65
// Generate the sitemap(s) and write them out via `self.writeSitemap`.
//
// Steps, in order: take the `apostrophe-site-map` lock, read the
// settings from `argv` and the module options, collect entries for
// every locale (pages, pieces, then `self.custom`), add hreflang
// alternates when the workflow module is present, write the result,
// and release the lock.
//
// Once the lock has been acquired it is ALWAYS released, even when an
// intermediate step fails; the callback then receives the first error
// that occurred (a step error takes precedence over an unlock error).
//
// Side effects: sets `self.workflow`, `self.format`, `self.indent`,
// `self.excludeTypes`, `self.perLocale`, `self.maps`, `self.today`
// and, when caching, resets `self.cacheOutput`.
self.map = function(callback) {

  self.workflow = self.apos.modules['apostrophe-workflow'];

  var argv = self.apos.argv;

  if (self.caching) {
    self.cacheOutput = [];
  }

  return lock(function(err) {
    if (err) {
      // We never got the lock, so there is nothing to release
      return callback(err);
    }
    return async.series([
      init,
      map,
      hreflang,
      write
    ], function(err) {
      // Release the lock whether or not the steps above succeeded,
      // otherwise a single failure would block every later run
      return unlock(function(unlockErr) {
        return callback(err || unlockErr || null);
      });
    });
  });

  function lock(callback) {
    return self.apos.locks.lock('apostrophe-site-map', callback);
  }

  function init(callback) {
    self.format = argv.format || options.format || 'xml';

    self.indent = (typeof(argv.indent) !== 'undefined') ? argv.indent : options.indent;

    self.excludeTypes = options.excludeTypes || [];

    if (argv['exclude-types']) {
      self.excludeTypes = self.excludeTypes.concat(argv['exclude-types'].split(','));
    }

    self.perLocale = options.perLocale || argv['per-locale'];
    // Exception: plaintext sitemaps and sitemap indexes don't go
    // together, so we can presume that if they explicitly ask
    // for plaintext they are just doing content strategy and we
    // should produce a single report
    if (self.format === 'text') {
      self.perLocale = false;
    }
    return callback(null);
  }

  function map(callback) {
    self.maps = {};
    self.today = moment().format('YYYY-MM-DD');

    var locales = [ defaultLocale ];

    if (self.workflow) {
      // Skip draft locales and locales marked private
      locales = _.filter(_.keys(self.workflow.locales), function(locale) {
        return !locale.match(/-draft$/) && !self.workflow.locales[locale].private;
      });
    }

    return async.eachSeries(locales, function(locale, callback) {
      var req = self.apos.tasks.getAnonReq({ locale: locale });
      req.locale = locale;
      return async.series([
        _.partial(self.getPages, req, locale),
        _.partial(self.getPieces, req, locale),
        function(callback) {
          // bc: an override of `self.custom` that declares a single
          // parameter receives only the callback
          if (self.custom.length === 1) {
            return self.custom(callback);
          } else {
            return self.custom(req, locale, callback);
          }
        }
      ], callback);
    }, function(err) {
      if (err) {
        return callback(err);
      }
      return callback(null);
    });
  }

  function hreflang(callback) {

    // Alternates are only meaningful with the workflow module. Without
    // it there is no `xhtml:link` array to append to, so docs that
    // still carry a shared `workflowGuid` must not be cross-linked.
    if (self.workflow) {
      var alternativesByGuid = {};

      each(function(entry) {
        var guid = entry.url.workflowGuid;
        if (!alternativesByGuid[guid]) {
          alternativesByGuid[guid] = [];
        }
        alternativesByGuid[guid].push(entry);
      });

      each(function(entry) {
        // Each entry lists itself first, then its other-locale siblings
        var links = [ link(entry) ];
        _.each(alternativesByGuid[entry.url.workflowGuid], function(alternative) {
          if (alternative === entry) {
            return;
          }
          links.push(link(alternative));
        });
        entry.url['xhtml:link'] = links;
      });
    }

    // The workflow properties were only bookkeeping; they must not
    // appear in the output
    each(function(entry) {
      delete entry.url.workflowLocale;
      delete entry.url.workflowGuid;
    }, true);

    return setImmediate(callback);

    // Build the alternate link element pointing at `entry`
    function link(entry) {
      return {
        _attributes: {
          rel: 'alternate',
          hreflang: entry.url.workflowLocale,
          href: entry.url.loc
        }
      };
    }

    // Invoke `iterator` for every object entry of every map. Entries
    // without a `workflowGuid` are skipped unless `ignoreWorkflow` is set.
    function each(iterator, ignoreWorkflow) {
      _.each(self.maps, function(map) {
        _.each(map, function(entry) {
          // Text-format entries are plain strings
          if (!entry || (typeof(entry) !== 'object') || !entry.url) {
            return;
          }

          if (!entry.url.workflowGuid && !ignoreWorkflow) {
            return;
          }
          iterator(entry);
        });
      });
    }

  }

  function write(callback) {
    // `self.writeSitemap` writes synchronously and may throw (for
    // instance when a sitemap index is requested without a baseUrl).
    // Turn that into a callback error so the lock is still released.
    var finished = false;
    function finish(err) {
      if (finished) {
        return;
      }
      finished = true;
      return callback(err);
    }
    try {
      return self.writeSitemap(finish);
    } catch (e) {
      if (finished) {
        // Thrown by code running after we already reported; not ours to handle
        throw e;
      }
      return finish(e);
    }
  }

  function unlock(callback) {
    return self.apos.locks.unlock('apostrophe-site-map', callback);
  }
};
212
// Fetch every page visible to `req`, without areas or joins, ordered
// by level and then rank, and pass each one to `self.output`.
// `locale` is accepted but not used here.
self.getPages = function(req, locale, callback) {
  var cursor = self.apos.pages.find(req)
    .areas(false)
    .joins(false)
    .sort({ level: 1, rank: 1 });

  return cursor.toArray(function(err, pages) {
    if (err) {
      return callback(err);
    }
    _.forEach(pages, self.output);
    return callback(null);
  });
};
222
// Output sitemap entries for the pieces of every module that extends
// `apostrophe-pieces`, except modules whose name is listed in
// `self.excludeTypes`. Pieces without a `_url` are skipped.
//
// Pieces are fetched `self.piecesPerBatch` at a time. A database
// error in any batch stops the iteration and is passed to `callback`.
// `locale` is accepted but not used here.
self.getPieces = function(req, locale, callback) {
  var modules = _.filter(self.apos.modules, function(module, name) {
    return _.find(module.__meta.chain, function(entry) {
      return entry.name === 'apostrophe-pieces';
    });
  });
  return async.eachSeries(modules, function(module, callback) {
    if (_.includes(self.excludeTypes, module.name)) {
      return setImmediate(callback);
    }
    // Paginate through 100 (by default) at a time to
    // avoid slamming memory
    var done = false;
    var skip = 0;
    return async.whilst(
      function() { return !done; },
      function(callback) {
        return self.findPieces(req, module).skip(skip).limit(self.piecesPerBatch).toArray(function(err, pieces) {
          if (err) {
            // Without this, `pieces` is undefined and the length
            // check below would throw instead of reporting the error
            return callback(err);
          }
          _.each(pieces, function(piece) {
            if (!piece._url) {
              // This one has no page to be viewed on
              return;
            }
            // Results in a reasonable priority relative
            // to regular pages
            piece.level = 3;
            // Future events are interesting,
            // past events are boring
            if (piece.startDate) {
              if (piece.startDate > self.today) {
                piece.level--;
              } else {
                piece.level++;
              }
            }
            self.output(piece);
          });
          if (!pieces.length) {
            // An empty batch means we have seen everything
            done = true;
          } else {
            skip += pieces.length;
          }
          return callback(null);
        });
      }, callback);
  }, callback);
};
270
// Write out everything accumulated in `self.maps`.
//
// With `self.perLocale` set, each locale gets its own file under
// `sitemaps/` plus an index. Otherwise all locales are combined into
// one file: `sitemap.xml` when caching, else `argv.file` or
// `/dev/stdout`. When caching, the buffered output is then stored in
// the cache before the callback is invoked.
self.writeSitemap = function(callback) {
  if (self.perLocale) {
    // They should be broken down by host,
    // in which case we automatically place them
    // in public/sitemaps in a certain naming pattern
    self.ensureDir('sitemaps');
    _.each(self.maps, function(entries, key) {
      var extension = (self.format === 'xml') ? 'xml' : 'txt';
      var body = _.map(entries, self.stringify).join('\n');
      self.writeMap('sitemaps/' + key + '.' + extension, body);
    });
    self.writeIndex();
  } else {
    // Simple single-file sitemap
    if (self.caching) {
      self.file = 'sitemap.xml';
    } else {
      self.file = self.apos.argv.file || '/dev/stdout';
    }
    var sections = [];
    _.each(_.keys(self.maps), function(locale) {
      sections.push(_.map(self.maps[locale], self.stringify).join('\n'));
    });
    self.writeMap(self.file, sections.join('\n'));
  }

  return self.caching ? self.writeToCache(callback) : callback(null);
};
296
// Convert one sitemap entry to its output form.
//
// If `value` is not an object, it is returned as-is, or passed through
// `self.apos.utils.escapeHtml` if `self.format` is `xml`. When the
// format is not `xml`, an array is joined with no separator.
//
// If it is an object, it is converted to XML elements,
// one for each property; they may have sub-elements if
// the properties contain objects. The _attributes
// property is converted to attributes. Array
// properties are converted to a series of elements.
//
// A property whose value is the number 0 is output as content
// (for example a priority of 0). Other falsy values produce an
// empty element.
//
// TODO: this is clearly yak-shaving, but the data format
// is nice. See if there's another library that takes the same
// or substantially the same format.

self.stringify = function(value) {
  if (Array.isArray(value) && (self.format !== 'xml')) {
    return value.join('');
  }
  if (typeof(value) !== 'object') {
    if (self.format === 'xml') {
      return self.apos.utils.escapeHtml(value);
    }
    return value;
  }
  var xml = '';

  // Append one element named `k` with content `v`
  function element(k, v) {
    xml += '<' + k;
    if (v && v._attributes) {
      _.each(v._attributes, function(av, a) {
        xml += ' ' + a + '="' + self.apos.utils.escapeHtml(av) + '"';
      });
    }
    xml += '>';
    // `v || ''` alone would discard a legitimate numeric 0
    xml += self.stringify((v === 0) ? v : (v || ''));
    xml += '</' + k + '>\n';
  }

  _.each(value, function(v, k) {
    if (k === '_attributes') {
      // Already rendered as attributes of the enclosing element
      return;
    }
    if (Array.isArray(v)) {
      _.each(v, function(el) {
        element(k, el);
      });
    } else {
      element(k, v);
    }
  });
  return xml;
};
346
// Attempt to create `dir` beneath the project's `public` folder.
// Does nothing when caching. Any failure of the mkdir call is
// ignored, on the assumption that the folder already exists.
self.ensureDir = function(dir) {
  if (self.caching) {
    return;
  }
  var target = self.apos.rootDir + '/public/' + dir;
  try {
    fs.mkdirSync(target);
  } catch (ignored) {
    // exists
  }
};
357
// Write `sitemaps/index.xml`, a sitemap index with one `<sitemap>`
// element per key of `self.maps`. Every element carries the same
// `lastmod` timestamp. Throws if no `baseUrl` is configured.
self.writeIndex = function() {
  var now = new Date();
  if (!self.baseUrl) {
    throw new Error(
      'You must specify the top-level baseUrl option when configuring Apostrophe\n' +
      'to use sitemap indexes. Example: baseUrl: "http://mycompany.com"\n\n' +
      'Note there is NO TRAILING SLASH.\n\n' +
      'Usually you will override this in data/local.js, on production.'
    );
  }

  var lastmod = now.toISOString();

  var header = '<?xml version="1.0" encoding="UTF-8"?>\n' +
    '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"' +
    ' xmlns:xhtml="http://www.w3.org/1999/xhtml">\n';

  var items = _.map(_.keys(self.maps), function(key) {
    return ' <sitemap>\n' +
      ' <loc>' + self.baseUrl + self.apos.prefix + '/sitemaps/' + key + '.xml</loc>\n' +
      ' <lastmod>' + lastmod + '</lastmod>\n' +
      ' </sitemap>\n';
  });

  self.writeFile('sitemaps/index.xml', header + items.join('') + '</sitemapindex>\n');
};
386
// Write the already-stringified `map` to `file`. XML output is wrapped
// in a `<urlset>` document; any other format is written unchanged.
self.writeMap = function(file, map) {
  if (self.format !== 'xml') {
    self.writeFile(file, map);
    return;
  }
  self.writeXmlMap(file, map);
};
394
// Wrap `map` in the XML declaration and a `<urlset>` element declaring
// the sitemap and xhtml namespaces, then write it to `file`.
self.writeXmlMap = function(file, map) {
  var opening = '<?xml version="1.0" encoding="UTF-8"?>\n' +
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"' +
    ' xmlns:xhtml="http://www.w3.org/1999/xhtml">\n';
  var closing = '</urlset>\n';
  self.writeFile(file, opening + map + closing);
};
404
// Emit the string `s` under the name `filename`.
//
// When caching, the content is only buffered in `self.cacheOutput`
// together with a creation date. Otherwise `filename` is resolved
// against the project's `public` folder and written synchronously.
self.writeFile = function(filename, s) {
  if (self.caching) {
    self.cacheOutput.push({
      filename: filename,
      data: s,
      createdAt: new Date()
    });
    return;
  }

  var publicDir = self.apos.rootDir + '/public';
  var target = require('path').resolve(publicDir, filename);

  if (target === '/dev/stdout') {
    // Strange bug on MacOS when using writeFileSync with /dev/stdout
    fs.writeSync(1, s);
    return;
  }
  fs.writeFileSync(target, s);
};
422
// Replace the cache contents with the buffered output: clear the
// cache, then store each buffered document under its filename with
// a lifetime of `self.cacheLifetime`, one at a time.
self.writeToCache = function(callback) {
  return async.series([
    function(next) {
      return self.cache.clear(next);
    },
    function(next) {
      return async.eachSeries(self.cacheOutput, function(doc, stored) {
        return self.cache.set(doc.filename, doc, self.cacheLifetime, stored);
      }, next);
    }
  ], callback);
};
439
// Extension point: override to add your own entries. You can invoke
// `self.output(doc)` from here as many times as you like before
// invoking the callback. The default implementation adds nothing.
//
// `self.map` looks at the number of declared parameters: an override
// that declares exactly one receives only the callback.

self.custom = function(req, locale, callback) {
  return callback(null);
};
446
// Build the cursor used to fetch the pieces of `module`: published
// docs only, with joins and areas turned off.
self.findPieces = function(req, module) {
  var cursor = module.find(req);
  cursor = cursor.published(true);
  cursor = cursor.joins(false);
  return cursor.areas(false);
};
450
// Output the sitemap entry for the given doc.
// The entry is buffered for output as part of the map for the appropriate
// locale. If the workflow module is not in use they all accumulate together
// for a "default" locale. Content not subject to workflow is grouped with
// the "default" locale. If workflow is active and the locale is not configured
// or is marked private, the output is discarded. Docs whose type is
// listed in `self.excludeTypes` are discarded as well.
//
// In `text` format the entry is a single line holding the URL, preceded
// by one indent unit per `page.level` when `self.indent` is set. In any
// other format it is a `url` object whose priority is derived from
// `page.level`, unless the doc has a numeric `siteMapPriority`.

self.output = function(page) {
  var locale = page.workflowLocale || defaultLocale;
  if (self.workflow) {
    var localeConfig = self.workflow.locales[locale];
    if (!localeConfig || localeConfig.private) {
      return;
    }
  }

  if (_.includes(self.excludeTypes, page.type)) {
    return;
  }

  if (self.format === 'text') {
    // Build the whole line before writing it: entries are joined with
    // newlines later, so an indent written separately would land on a
    // line of its own. The URL is written whether or not indentation
    // was requested.
    var line = '';
    if (self.indent) {
      var i;
      for (i = 0; (i < page.level); i++) {
        line += ' ';
      }
    }
    self.write(locale, line + page._url + '\n');
    return;
  }

  // Shallower pages rank higher; level 10 and deeper bottom out at 0.1
  var priority = (page.level < 10) ? (1.0 - page.level / 10) : 0.1;

  if (typeof (page.siteMapPriority) === 'number') {
    priority = page.siteMapPriority;
  }

  self.write(locale, {
    url: {
      priority: priority,
      changefreq: 'daily',
      loc: page._url,
      // Bookkeeping for hreflang; removed again before output
      workflowGuid: page.workflowGuid,
      workflowLocale: locale
    }
  });

};
503
// Add the entry `s` to the list of map entries kept for `locale`,
// creating that list on first use.

self.write = function(locale, s) {
  if (!self.maps[locale]) {
    self.maps[locale] = [];
  }
  self.maps[locale].push(s);
};
511
// Register the GET routes for `/sitemap.xml` and `/sitemaps/*`.
// Both deliver from our tiny little fake cache filesystem.
self.addRoutes = function() {
  var app = self.apos.app;

  app.get('/sitemap.xml', function(req, res) {
    return self.sendCache(res, 'sitemap.xml');
  });

  app.get('/sitemaps/*', function(req, res) {
    var rest = req.params[0];
    return self.sendCache(res, 'sitemaps/' + rest);
  });
};
521
// Respond with the cached file stored under `path`.
//
// If it is missing, look for the main sitemap file (the index when
// `self.perLocale` is set, else `sitemap.xml`). If that one exists the
// cache is populated and `path` is simply unknown: respond 404.
// If it does not, the cache has expired or was never populated, so
// regenerate and try again. Without this check every 404 would cause
// a lot of work to be done. Cache errors are logged and answered
// with a 500.
self.sendCache = function(res, path) {
  return self.cache.get(path, function(err, file) {
    if (err) {
      return fail(err);
    }
    if (file) {
      return res.contentType('text/xml').send(file.data);
    }
    var mainFile = self.perLocale ? 'sitemaps/index.xml' : 'sitemap.xml';
    return self.cache.get(mainFile, function(err, exists) {
      if (err) {
        return fail(err);
      }
      if (!exists) {
        return self.cacheAndRetry(res, path);
      }
      return res.status(404).send('not found');
    });
  });

  function fail(err) {
    console.error(err);
    return res.status(500).send('error');
  }
};
559
// Regenerate the sitemap via `self.map`, then attempt to deliver
// `path` again. A generation error is logged and answered with a 500.
self.cacheAndRetry = function(res, path) {
  return self.map(function(err) {
    if (!err) {
      return self.sendCache(res, path);
    }
    console.error(err);
    return res.status(500).send('error');
  });
};
572
// Obtain the `apostrophe-sitemap` cache and keep it as `self.cache`.
self.enableCache = function() {
  var caches = self.apos.caches;
  self.cache = caches.get('apostrophe-sitemap');
};
576 }
577};