UNPKG

14.2 kBJavaScriptView Raw
1var xml2js = require('xml2js');
2var fs = require('fs');
3var util = require('util');
4var async = require('async');
5var moment = require('moment');
6var _ = require('lodash');
7var cheerio = require('cheerio');
8var splitHtml = require('split-html');
9var path = require('path');
10var request = require('request');
11var util = require('util');
12
13module.exports = function(self, argv, callback) {
14 var data;
15 var parent;
16 var req = self._apos.getTaskReq();
17 return async.series({
18 usage: function(callback) {
19 if (argv._.length !== 3)
20 {
21 return callback('The first argument must be a Wordpress XML export filename. The second argument must be the slug of an existing blog page on your A2 site.');
22 }
23 return callback(null);
24 },
25 getParent: function(callback) {
26 return self.indexes.getOne(req, { slug: argv._[2] }, {}, function(err, _parent) {
27 if (err) {
28 return callback(err);
29 }
30 if (!_parent) {
31 return callback('No such parent blog page found');
32 }
33 parent = _parent;
34 return callback(null);
35 });
36 },
37 parse: function(callback) {
38 return xml2js.parseString(fs.readFileSync(argv._[1], 'utf8'), function(err, result) {
39 if (err) {
40 return callback(err);
41 }
42 data = result;
43 return callback(null);
44 });
45 },
46 insert: function(callback) {
47 var count = 0;
48 var documents = data.rss.channel[0].item;
49 posts = _.filter(documents, function(post) {
50 return (post['wp:post_type'] && (post['wp:post_type'][0] === 'post'));
51 });
52
53 // Optional parallel processing
54 return async.eachLimit(posts, argv.parallel || 1, function(post, callback) {
55 var html = post['content:encoded'][0];
56 count++;
57 console.log(post['title'][0] + ': ' + count + ' of ' + posts.length);
58 var publishedAt = new Date(post.pubDate[0]);
59 var items = [];
60
61 return async.series({
62 meta: function(callback) {
63 return async.eachSeries(post['wp:postmeta'] || [], function(meta, callback) {
64 var key = meta['wp:meta_key'] && meta['wp:meta_key'][0];
65 var code = meta['wp:meta_value'] && meta['wp:meta_value'][0];
66 if (key === 'embed') {
67 var matches = code.match(/(http"|https:)?\/\/[^'"]+/);
68 if (matches) {
69 return self._apos.acceptVideo(req, { url: matches[0] }, function(err, video) {
70 if (err) {
71 console.error('WARNING: Apostrophe couldn\'t figure out what to do with this embedded item: ' + code);
72 } else {
73 items.push({
74 type: (video.type === "video") ? 'video' : 'embed',
75 video: matches[0],
76 thumbnail: video.thumbnail
77 });
78 }
79 return callback(null);
80 });
81 }
82 }
83 return setImmediate(callback);
84 }, callback);
85 },
86 body: function(callback) {
87
88 // Cope with non-container shortcodes by special-
89 // casing them and turning into HTML tags for our
90 // HTML parser. Add new ones in alternation
91 // with | in the regex below. This is the only
92 // way to go because they don't have an XHTML-style
93 // self-closing notation.
94
95 html = html.replace(/\[(portfolio_slideshow)(.*?)\]/g, function(everything, name, attributes) {
96 return '<wps' + name + attributes + ' />';
97 });
98
99 // Cope with container shortcodes by converting their
100 // syntax so that they appear as HTML tags to
101 // our HTML parser.
102
103 var before = html;
104 html = html.replace(/\[(\w+)(.*?)\](.*?)\[\/(\w+)\]/g, function(everything, name, attributes, body, closeName) {
105 return '<wps' + name + attributes + '>' + body + '</wps' + closeName + '>';
106 });
107
108 // Split the markup up into an alternation of
109 // special cases with chunks of ordinary markup.
110
111 // Special cases are currently: [youtube], [vimeo], [caption],
112 // and <a><img /></a>.
113
114 var fragments = splitHtml(html, 'wpsportfolio_slideshow, wpsbutton, wpsyoutube, wpsvimeo, wpscaption, a', function($el) {
115 if ($el[0].name === 'a') {
116 return $el.find('img').length;
117 } else {
118 return true;
119 }
120 });
121
122 var i = 0;
123 return async.eachSeries(fragments, function(fragment, callback) {
124 var isSpecial = i & 1;
125 i++;
126 if (!isSpecial) {
127 // In Wordpress, every double newline
128 // is a paragraph break. This is accomplished
129 // with this hideously complex function on
130 // every single page render:
131
132 // https://core.trac.wordpress.org/browser/tags/4.0/src/wp-includes/formatting.php#L0
133
134 // We are not going to do any such terrible thing.
135 // We import them as a simple pair of br's. People
136 // can make nice paragraphs later in our editor
137 // if they want, but I can't dice this sushi.
138 //
139 // Offer an option not to do this since a few
140 // Wordpress blogs may have it turned off. -Tom
141
142 if (!argv['no-autop']) {
143 fragment = fragment.replace(/\r?\n\r?\n/g, '<br />\n<br />\n');
144 }
145
146 var item = {
147 type: 'richText',
148 content: fragment
149 };
150 self._apos.itemTypes.richText.sanitize(item);
151 items.push(item);
152 return setImmediate(callback);
153 }
154 var $ = cheerio.load('<div>' + fragment + '</div>');
155
156 var $img = $('img');
157 if ($img.length) {
158 // wpscaption, or img inside a, or just plain img
159 var src = $img.attr('src');
160 var href = $('a').attr('href');
161 // Sometimes it's an attribute...
162 var title = $('wpscaption').attr('caption');
163 if (!title) {
164 // But sometimes it's a text node, because
165 // why the hell not?
166 title = $('wpscaption').text().trim();
167 }
168 if (href && src) {
169 if (path.extname(href) === path.extname(src)) {
170 // The 'a' is a link to a better version
171 // of the image
172 src = href;
173 }
174 }
175 if (!src) {
176 console.error('WARNING: missing image URL, ignoring image');
177 return setImmediate(callback);
178 }
179 // encoding: null to get the binary file as a
180 // buffer rather than a UTF8 string
181 return request(src, { encoding: null }, function(err, response, body) {
182 if (err || (response.status >= 300)) {
183 console.error('WARNING: image ' + src + ' not accessible, ignoring');
184 return setImmediate(callback);
185 }
186 var tmp = self._apos.uploadfs.getTempPath();
187 var name = self._apos.generateId();
188 tmp += '/' + name;
189 fs.writeFileSync(tmp, body);
190 name = path.basename(src);
191 return self._apos.acceptFiles(req, { path: tmp, name: name }, function(err, infos) {
192 if (err) {
193 return callback(err);
194 }
195 if (!infos.length) {
196 console.error('WARNING: image ' + src + ' downloaded by not accepted by Apostrophe');
197 return callback(null);
198 }
199 // acceptFiles doesn't take metadata because
200 // annotation is a later pass in the Apostrophe
201 // UI. So add the title now if we got one.
202
203 var file = infos[0];
204 var showTitles = false;
205 var showDescriptions = false;
206 if (title) {
207 if (argv['caption-as-description']) {
208 file.description = title;
209 showDescriptions = true;
210 } else {
211 file.title = title;
212 showTitle = true;
213 }
214 }
215 return self._apos.files.update({
216 _id: infos[0]._id
217 }, {
218 $set: {
219 title: title || infos[0].title,
220 description: title || ''
221 }
222 }, function(err) {
223 items.push({
224 type: 'slideshow',
225 ids: [ infos[0]._id ],
226 showTitles: showTitles,
227 showDescriptions: showDescriptions
228 });
229 return callback(null);
230 });
231 });
232 });
233 } else if ($('wpsyoutube').length || $('wpsvimeo').length) {
234 // simple video shortcodes
235 var url = $('wpsyoutube, wpsvimeo').text().trim();
236 console.log(post.title[0] + ' has video');
237 return self._apos.acceptVideo(req, { url: url }, function(err, video) {
238 if (err) {
239 console.error('WARNING: Apostrophe couldn\'t figure out what to do with this embedded item: ' + url);
240 } else {
241 items.push({
242 type: 'video',
243 video: url,
244 thumbnail: video.thumbnail
245 });
246 }
247 return callback(null);
248 });
249 } else if ($('wpsportfolio_slideshow').length) {
250 var excluded = [];
251 var exclude = $('wpsportfolio_slideshow').attr('exclude');
252 if (exclude && exclude.length) {
253 excluded = exclude.split(/\s*,\s*/);
254 }
255 // these are joined to attachment "posts" via
256 // the wp:post_parent property. Find the slides
257 // and make a slideshow
258
259 var images = [];
260 _.each(documents, function(slide) {
261 if (_.contains(excluded, slide['wp:post_id'][0])) {
262 console.log('excluding');
263 return;
264 }
265 if (!slide['wp:post_parent']) {
266 console.log('no parent');
267 return;
268 }
269 if (!((slide['wp:post_type'][0] == 'attachment') && (slide['wp:post_parent'][0] == post['wp:post_id'][0]))) {
270 return;
271 }
272 if (!slide['wp:attachment_url']) {
273 console.log('NO ATTACHMENT URL');
274 return;
275 }
276 images.push(slide['wp:attachment_url'][0]);
277 });
278 var candidates = [];
279 return async.series({
280 get: function(callback) {
281 return async.eachSeries(images, function(image, callback) {
282 return request(image, { encoding: null }, function(err, response, body) {
283 if (err) {
284 console.error(err);
285 return setImmediate(callback);
286 }
287 var tmp = self._apos.uploadfs.getTempPath();
288 var name = self._apos.generateId();
289 tmp += '/' + name;
290 fs.writeFileSync(tmp, body);
291 name = path.basename(image);
292 candidates.push({ path: tmp, name: name });
293 return callback(null);
294 }
295 );
296 }, callback);
297 },
298 accept: function(callback) {
299 return self._apos.acceptFiles(req, candidates, function(err, infos) {
300 if (err) {
301 return callback(err);
302 }
303 items.push({
304 type: 'slideshow',
305 ids: _.pluck(infos, '_id')
306 });
307 return callback(null);
308 });
309 }
310 }, callback);
311 } else {
312 return callback(new Error('Unexpected special, our parser should not have allowed that to happen: ' + fragment));
313 }
314 }, function(err) {
315 if (err) {
316 return callback(err);
317 }
318 var bodyArea = argv['body-area'] || 'body';
319 var a2Post = {
320 type: self.pieceName,
321 title: post.title[0],
322 publishedAt: publishedAt,
323 publicationDate: moment(publishedAt).format('YYYY-MM-DD'),
324 publicationTime: moment(publishedAt).format('HH:MM')
325 };
326 a2Post[bodyArea] = {
327 type: 'area',
328 items: items
329 };
330 if (post['wp:status'] && post['wp:status'][0] === 'publish') {
331 a2Post.published = true;
332 }
333 return self.pieces.putOne(req,
334 undefined,
335 { parent: parent },
336 a2Post,
337 callback);
338 }, callback);
339 }
340 }, callback);
341 }, callback);
342 }
343 }, callback);
344};