UNPKG

18.3 kBJavaScriptView Raw
1/* jshint node:true */
2
3var xml2js = require('xml2js');
4var fs = require('fs');
5var util = require('util');
6var async = require('async');
7var moment = require('moment');
8var _ = require('lodash');
9var cheerio = require('cheerio');
10var splitHtml = require('split-html');
11var path = require('path');
12var request = require('request');
13var urls = require('url');
14
15module.exports = function(self, argv, callback) {
16 var data;
17 var parent;
18 var req = self._apos.getTaskReq();
19 var base;
20
21 if (argv.base) {
22 base = argv.base;
23 } // otherwise we'll try to get it from the XML later on
24
25 function resolve(url) {
26 if (!base) {
27 return url;
28 }
29 return urls.resolve(base, url);
30 }
31
32 var creatorToCredit;
33
34 return async.series({
35 usage: function(callback) {
36 if (argv._.length !== 3)
37 {
38 return callback('The first argument must be a Wordpress XML export filename. The second argument must be the slug of an existing blog page on your A2 site.');
39 }
40 return callback(null);
41 },
42 getCreatorToCredit: function(callback) {
43 if (!argv['creator-to-credit']) {
44 return setImmediate(callback);
45 }
46 var parse = require('csv-parse');
47 var data = fs.readFileSync(argv['creator-to-credit'], 'utf8');
48 return parse(data, {}, function(err, _info) {
49 if (err) {
50 return callback(err);
51 }
52 creatorToCredit = {};
53 _.each(_info, function(row) {
54 if (row.length >= 2) {
55 creatorToCredit[row[0]] = row[1];
56 }
57 });
58 return callback(null);
59 });
60 },
61 getParent: function(callback) {
62 return self.indexes.getOne(req, { slug: argv._[2] }, {}, function(err, _parent) {
63 if (err) {
64 return callback(err);
65 }
66 if (!_parent) {
67 return callback('No such parent blog page found');
68 }
69 parent = _parent;
70 return callback(null);
71 });
72 },
73 parse: function(callback) {
74 return xml2js.parseString(fs.readFileSync(argv._[1], 'utf8'), function(err, result) {
75 if (err) {
76 return callback(err);
77 }
78 data = result;
79 return callback(null);
80 });
81 },
82 insert: function(callback) {
83 var count = 0;
84 if ((!base) && (data.rss.channel[0]['wp:base_blog_url'])) {
85 base = data.rss.channel[0]['wp:base_blog_url'][0];
86 }
87 var documents = data.rss.channel[0].item;
88 var posts = _.filter(documents, function(post) {
89 return (post['wp:post_type'] && (post['wp:post_type'][0] === 'post'));
90 });
91
92 // Optional parallel processing
93 return async.eachLimit(posts, argv.parallel || 1, function(post, callback) {
94 var html = post['content:encoded'][0];
95 count++;
96
97 var publishedAt = new Date(post.pubDate[0]);
98 var items = [];
99
100 var categories = [];
101 var tags = [];
102
103 categories = _.map(post.category, function(category) {
104 return category._;
105 });
106
107 tags = _.map(post.tag, function(tag) {
108 return tag._;
109 });
110
111 if (argv['with-tag']) {
112 if (!_.contains(tags, argv['with-tag'])) {
113 return setImmediate(callback);
114 }
115 }
116 if (argv['with-category']) {
117 if (!_.contains(categories, argv['with-category'])) {
118 return setImmediate(callback);
119 }
120 }
121
122 if (argv['without-tags']) {
123 var withoutTags = argv['without-tags'].split(/\s*,\s*/);
124 if (_.intersection(tags, withoutTags).length) {
125 return setImmediate(callback);
126 }
127 }
128
129 if (argv['without-categories']) {
130 var withoutCategories = argv['without-categories'].split(/\s*,\s*/);
131 if (_.intersection(categories, withoutCategories).length) {
132 return setImmediate(callback);
133 }
134 }
135
136 if (argv['ignore-tags']) {
137 tags = [];
138 }
139
140 if (argv['ignore-categories']) {
141 categories = [];
142 }
143
144 // a2 does not distinguish tags from categories
145 tags = categories.concat(tags);
146
147 var thumbnail;
148
149 return async.series({
150 meta: function(callback) {
151 return async.eachSeries(post['wp:postmeta'] || [], function(meta, callback) {
152 var key = meta['wp:meta_key'] && meta['wp:meta_key'][0];
153 var code = meta['wp:meta_value'] && meta['wp:meta_value'][0];
154 if (key === 'embed') {
155 var matches = code.match(/(http"|https:)?\/\/[^'"]+/);
156 if (matches) {
157 return self._apos.acceptVideo(req, { url: matches[0] }, function(err, video) {
158 if (err) {
159 console.error('WARNING: Apostrophe couldn\'t figure out what to do with this embedded item: ' + code);
160 } else {
161 items.push({
162 type: (video.type === "video") ? 'video' : 'embed',
163 video: matches[0],
164 thumbnail: video.thumbnail
165 });
166 }
167 return callback(null);
168 });
169 }
170 } else if (key === 'Image') {
171 var src = resolve(code);
172 // encoding: null to get the binary file as a
173 // buffer rather than a UTF8 string
174 return request(src, { encoding: null }, function(err, response, body) {
175 if (err || (response.status >= 300)) {
176 console.error('WARNING: image ' + src + ' not accessible, ignoring');
177 return setImmediate(callback);
178 }
179 var tmp = self._apos.uploadfs.getTempPath();
180 var name = self._apos.generateId();
181 tmp += '/' + name;
182 fs.writeFileSync(tmp, body);
183 name = path.basename(src);
184 return self._apos.acceptFiles(req, { path: tmp, name: name }, function(err, infos) {
185 if (err || (!infos.length)) {
186 console.error('WARNING: image ' + src + ' downloaded by not accepted by Apostrophe');
187 return callback(null);
188 }
189
190 thumbnail = {
191 type: 'area',
192 items: [
193 {
194 type: 'slideshow',
195 ids: [ infos[0]._id ]
196 }
197 ]
198 };
199 return callback(null);
200 });
201 });
202 }
203 return setImmediate(callback);
204 }, callback);
205 },
206 body: function(callback) {
207
208 // Cope with non-container shortcodes by special-
209 // casing them and turning into HTML tags for our
210 // HTML parser. Add new ones in alternation
211 // with | in the regex below. This is the only
212 // way to go because they don't have an XHTML-style
213 // self-closing notation.
214
215 html = html.replace(/\[(portfolio_slideshow)(.*?)\]/g, function(everything, name, attributes) {
216 return '<wps' + name + attributes + ' />';
217 });
218
219 // Cope with container shortcodes by converting their
220 // syntax so that they appear as HTML tags to
221 // our HTML parser.
222
223 var before = html;
224 html = html.replace(/\[(\w+)(.*?)\](.*?)\[\/(\w+)\]/g, function(everything, name, attributes, body, closeName) {
225 return '<wps' + name + attributes + '>' + body + '</wps' + closeName + '>';
226 });
227
228 // Split the markup up into an alternation of
229 // special cases with chunks of ordinary markup.
230
231 // Special cases are currently: [youtube], [vimeo], [caption],
232 // and <a><img /></a>.
233
234 var fragments = splitHtml(html, 'wpsportfolio_slideshow, wpsbutton, wpsyoutube, wpsvimeo, wpscaption, a', function($el) {
235 if ($el[0].name === 'a') {
236 return $el.find('img').length;
237 } else {
238 return true;
239 }
240 });
241
242 var i = 0;
243 return async.eachSeries(fragments, function(fragment, callback) {
244 var isSpecial = i & 1;
245 i++;
246 if (!isSpecial) {
247 // In Wordpress, every double newline
248 // is a paragraph break. This is accomplished
249 // with this hideously complex function on
250 // every single page render:
251
252 // https://core.trac.wordpress.org/browser/tags/4.0/src/wp-includes/formatting.php#L0
253
254 // We are not going to do any such terrible thing.
255 // We import them as a simple pair of br's. People
256 // can make nice paragraphs later in our editor
257 // if they want, but I can't dice this sushi.
258 //
259 // Offer an option not to do this since a few
260 // Wordpress blogs may have it turned off. -Tom
261
262 if (!argv['no-autop']) {
263 fragment = fragment.replace(/\r?\n\r?\n/g, '<br />\n<br />\n');
264 }
265
266 var item = {
267 type: 'richText',
268 content: fragment
269 };
270 self._apos.itemTypes.richText.sanitize(item);
271 items.push(item);
272 return setImmediate(callback);
273 }
274 var $ = cheerio.load('<div>' + fragment + '</div>');
275
276 var $img = $('img');
277 if ($img.length) {
278 // wpscaption, or img inside a, or just plain img
279 var src = $img.attr('src');
280 var href = $('a').attr('href');
281 // Sometimes it's an attribute...
282 var title = $('wpscaption').attr('caption');
283 if (!title) {
284 // But sometimes it's a text node, because
285 // why the hell not?
286 title = $('wpscaption').text().trim();
287 }
288 if (href && src) {
289 if (path.extname(href) === path.extname(src)) {
290 // The 'a' is a link to a better version
291 // of the image
292 src = href;
293 }
294 }
295 if (!src) {
296 console.error('WARNING: missing image URL, ignoring image');
297 return setImmediate(callback);
298 }
299 // encoding: null to get the binary file as a
300 // buffer rather than a UTF8 string
301 return request(resolve(src), { encoding: null }, function(err, response, body) {
302 if (err || (response.status >= 300)) {
303 console.error('WARNING: image ' + src + ' not accessible, ignoring');
304 return setImmediate(callback);
305 }
306 var tmp = self._apos.uploadfs.getTempPath();
307 var name = self._apos.generateId();
308 tmp += '/' + name;
309 fs.writeFileSync(tmp, body);
310 name = path.basename(src);
311 return self._apos.acceptFiles(req, { path: tmp, name: name }, function(err, infos) {
312 if (err || (!infos.length)) {
313 console.error('WARNING: image ' + src + ' downloaded by not accepted by Apostrophe');
314 return callback(null);
315 }
316 // acceptFiles doesn't take metadata because
317 // annotation is a later pass in the Apostrophe
318 // UI. So add the title now if we got one.
319
320 var file = infos[0];
321 var showTitles = false;
322 var showDescriptions = false;
323 if (title) {
324 if (argv['caption-as-description']) {
325 file.description = title;
326 showDescriptions = true;
327 } else {
328 file.title = title;
329 showTitles = true;
330 }
331 }
332 return self._apos.files.update({
333 _id: infos[0]._id
334 }, {
335 $set: {
336 title: title || infos[0].title,
337 description: title || ''
338 }
339 }, function(err) {
340 items.push({
341 type: 'slideshow',
342 ids: [ infos[0]._id ],
343 showTitles: showTitles,
344 showDescriptions: showDescriptions
345 });
346 return callback(null);
347 });
348 });
349 });
350 } else if ($('wpsyoutube').length || $('wpsvimeo').length) {
351 // simple video shortcodes
352 var url = $('wpsyoutube, wpsvimeo').text().trim();
353 return self._apos.acceptVideo(req, { url: url }, function(err, video) {
354 if (err) {
355 console.error('WARNING: Apostrophe couldn\'t figure out what to do with this embedded item: ' + url);
356 } else {
357 items.push({
358 type: 'video',
359 video: url,
360 thumbnail: video.thumbnail
361 });
362 }
363 return callback(null);
364 });
365 } else if ($('wpsportfolio_slideshow').length) {
366 var excluded = [];
367 var exclude = $('wpsportfolio_slideshow').attr('exclude');
368 if (exclude && exclude.length) {
369 excluded = exclude.split(/\s*,\s*/);
370 }
371 // these are joined to attachment "posts" via
372 // the wp:post_parent property. Find the slides
373 // and make a slideshow
374
375 var images = [];
376 _.each(documents, function(slide) {
377 if (_.contains(excluded, slide['wp:post_id'][0])) {
378 console.log('excluding');
379 return;
380 }
381 if (!slide['wp:post_parent']) {
382 console.log('no parent');
383 return;
384 }
385 if (!((slide['wp:post_type'][0] == 'attachment') && (slide['wp:post_parent'][0] == post['wp:post_id'][0]))) {
386 return;
387 }
388 if (!slide['wp:attachment_url']) {
389 console.log('NO ATTACHMENT URL');
390 return;
391 }
392 images.push(slide['wp:attachment_url'][0]);
393 });
394 var candidates = [];
395 return async.series({
396 get: function(callback) {
397 return async.eachSeries(images, function(image, callback) {
398 return request(resolve(image), { encoding: null }, function(err, response, body) {
399 if (err) {
400 console.error(err);
401 return setImmediate(callback);
402 }
403 var tmp = self._apos.uploadfs.getTempPath();
404 var name = self._apos.generateId();
405 tmp += '/' + name;
406 fs.writeFileSync(tmp, body);
407 name = path.basename(image);
408 candidates.push({ path: tmp, name: name });
409 return callback(null);
410 }
411 );
412 }, callback);
413 },
414 accept: function(callback) {
415 return self._apos.acceptFiles(req, candidates, function(err, infos) {
416 if (err || (!infos.length)) {
417 console.error('WARNING: image ' + src + ' downloaded by not accepted by Apostrophe');
418 return callback(null);
419 }
420 items.push({
421 type: 'slideshow',
422 ids: _.pluck(infos, '_id')
423 });
424 return callback(null);
425 });
426 }
427 }, callback);
428 } else {
429 return callback(new Error('Unexpected special, our parser should not have allowed that to happen: ' + fragment));
430 }
431 }, function(err) {
432 if (err) {
433 return callback(err);
434 }
435 var bodyArea = argv['body-area'] || 'body';
436 var credit;
437 var creator = post['dc:creator'];
438 if (creator && creator.length) {
439 credit = creator[0];
440 }
441 if (creatorToCredit && _.has(creatorToCredit, credit)) {
442 credit = creatorToCredit[credit];
443 }
444 var a2Post = {
445 tags: tags,
446 type: self.pieceName,
447 title: post.title[0],
448 publishedAt: publishedAt,
449 publicationDate: moment(publishedAt).format('YYYY-MM-DD'),
450 publicationTime: moment(publishedAt).format('HH:MM')
451 };
452 if (credit) {
453 a2Post.credit = credit;
454 }
455 if (thumbnail) {
456 a2Post.thumbnail = thumbnail;
457 }
458 a2Post[bodyArea] = {
459 type: 'area',
460 items: items
461 };
462 if (post['wp:status'] && post['wp:status'][0] === 'publish') {
463 a2Post.published = true;
464 }
465 return self.pieces.putOne(req,
466 undefined,
467 { parent: parent },
468 a2Post,
469 callback);
470 }, callback);
471 }
472 }, callback);
473 }, callback);
474 }
475 }, callback);
476};