// UNPKG raw view - 11.4 kB, JavaScript
'use strict';

var EventEmitter = require('events').EventEmitter,
    emits = require('emits'),
    html = require('htmlparser2'),
    domutils = require('domutils'),
    util = require('util'),
    uuid = require('uuid'),
    async = require('async'),
    url = require('url'),
    request = require('request-promise'),
    probeImageSize = require('probe-image-size'),
    _ = require('lodash'),
    sizeOf = require('image-size'),
    validator = require('validator'),
    helpers = require('./helpers'),
    // Default configuration. The Amperize constructor merges caller options
    // over these values. Each 'amp-*' entry holds the fallback attributes used
    // when an element does not supply its own.
    DEFAULTS = {
        'amp-img': {
            layout: 'responsive',
            width: 600,
            height: 400
        },
        'amp-anim': {
            layout: 'responsive',
            width: 600,
            height: 400
        },
        'amp-iframe': {
            layout: 'responsive',
            width: 600,
            height: 400,
            sandbox: 'allow-scripts allow-same-origin'
        },
        'amp-youtube': {
            layout: 'responsive',
            width: 600,
            height: 400
        },
        // passed as the `timeout` option of the image size requests
        // (presumably milliseconds - confirm against the request libraries)
        'request_timeout': 3000
    };
41
// File extensions supported by image-size but not by probe-image-size. Images
// with one of these extensions are downloaded in full before being measured.
const FETCH_ONLY_FORMATS = [
    'cur', 'icns', 'ico', 'dds'
];
46
/**
 * Amperizer constructor. Borrows from Minimize.
 *
 * https://github.com/Swaagie/minimize/blob/4b815e274a424ca89551d28c4e0dd8b06d9bbdc2/lib/minimize.js#L15
 *
 * Merges the given options over DEFAULTS into `this.config` and creates the
 * HTML parser whose DOM handler re-emits its result as a `read` event on this
 * instance.
 *
 * @constructor
 * @param {Object} options Options object
 * @api public
 */
function Amperize(options) {
    // initialise the emitter state this constructor inherits via util.inherits
    EventEmitter.call(this);

    // merge into a fresh object so DEFAULTS itself is never modified
    this.config = _.merge({}, DEFAULTS, options || {});
    this.emits = emits;

    this.htmlParser = new html.Parser(
        new html.DomHandler(this.emits('read'))
    );
}

util.inherits(Amperize, EventEmitter);
66
/**
 * Parse the content and call the callback. Borrowed from Minimize.
 *
 * https://github.com/Swaagie/minimize/blob/4b815e274a424ca89551d28c4e0dd8b06d9bbdc2/lib/minimize.js#L51
 *
 * Each call gets its own id so that the result is delivered only to the
 * callback of that call, through a one-off `parsed: <id>` event.
 *
 * @param {String} content HTML
 * @param {Function} callback Receives the arguments of the `parsed: <id>` event
 * @throws {Error} When no callback function is provided, or when parsing throws
 * @api public
 */
Amperize.prototype.parse = function parse(content, callback) {
    var id, parsedEvent, onRead;

    if (typeof callback !== 'function') {
        throw new Error('No callback provided');
    }

    id = uuid.v4();
    parsedEvent = 'parsed: ' + id;
    onRead = this.amperizer.bind(this, id);

    this.once('read', onRead);
    this.once(parsedEvent, callback);

    try {
        this.htmlParser.parseComplete(content);
    } catch (err) {
        // Do not leave one-off listeners behind: a stale `read` listener would
        // otherwise fire on the next parse() call with this call's id.
        this.removeListener('read', onRead);
        this.removeListener(parsedEvent, callback);
        throw err;
    }
};
90
/**
 * Turn a traversible DOM into string content. Borrowed from Minimize.
 *
 * https://github.com/Swaagie/minimize/blob/4b815e274a424ca89551d28c4e0dd8b06d9bbdc2/lib/minimize.js#L74
 *
 * Starts the traversal of the DOM and routes its result to the
 * `parsed: <id>` event.
 *
 * @param {String} id
 * @param {Object} error
 * @param {Object} dom Traversible DOM object
 * @throws {Error} When the parser reported an error; the original error is
 *   available as `cause`
 * @api private
 */
Amperize.prototype.amperizer = function amperizer(id, error, dom) {
    var failure;

    if (error) {
        failure = new Error('Amperizer failed to parse DOM');
        // keep the underlying parser error reachable for whoever catches this
        failure.cause = error;
        throw failure;
    }

    this.traverse(dom, '', this.emits('parsed: ' + id));
};
108
/**
 * Reduce the traversible DOM object to a string, converting <img>, <iframe>
 * and <audio> elements to AMP components on the way. Borrows from Minimize.
 *
 * https://github.com/Swaagie/minimize/blob/4b815e274a424ca89551d28c4e0dd8b06d9bbdc2/lib/minimize.js#L90
 *
 * Images are converted in a single pass over the whole tree before
 * serialization starts, so every image is measured at most once per call.
 * A failure of that pass is reported through `done`.
 *
 * @param {Array} data
 * @param {String} html Compiled HTML contents
 * @param {Function} done Callback function, called with (error, html)
 * @api private
 */
Amperize.prototype.traverse = async function traverse(data, html, done) {
    var self = this;
    var imageSizeCache = {};

    var requestOptions = {
        // We need the user-agent, otherwise some https requests may fail (e.g. Cloudflare)
        headers: {
            'User-Agent': 'Mozilla/5.0 Safari/537.36'
        },
        timeout: self.config['request_timeout'],
        encoding: null
    };

    // check if element.width is smaller than 300 px. In that case, we shouldn't use
    // layout="responsive", because the media element will be stretched and it doesn't
    // look nice. Use layout="fixed" instead to fix that.
    // An existing layout attribute is never overwritten.
    function setLayoutAttribute(element) {
        if (element.attribs.layout) {
            return;
        }

        element.attribs.layout = element.attribs.width < 300 ?
            'fixed' :
            self.config[element.name].layout;
    }

    // Certain component src attributes must use the 'https' protocol, otherwise they
    // will not get validated by AMP. If we're unable to replace it, we will deal with
    // the validation error, but at least we tried.
    // Only the start of the URL is inspected, so an 'https://' that appears later in
    // the URL (e.g. inside a query string) does not prevent the upgrade.
    function useSecureSchema(element) {
        var src = element.attribs && element.attribs.src;

        if (!src) {
            return;
        }

        if (/^http:\/\//i.test(src)) {
            // Replace 'http' with 'https', so the validation passes
            element.attribs.src = src.replace(/^http:\/\//i, 'https://');
        } else if (src.indexOf('//') === 0) {
            // Giphy embedded iFrames are without protocol and start with '//', so at
            // least we can fix those cases.
            element.attribs.src = 'https:' + src;
        }
    }

    // probe will fetch the minimal amount of data needed to determine
    // the image dimensions so it's more performant than a full fetch
    function _probeImageSize(imageUrl) {
        return probeImageSize(
            imageUrl,
            requestOptions
        ).then(function (result) {
            imageSizeCache[imageUrl] = result;
            return result;
        });
    }

    // fetch the full image before reading dimensions using image-size,
    // it's slower but has better format support
    function _fetchImageSize(imageUrl) {
        return request(
            imageUrl,
            requestOptions
        ).then(function (response) {
            var result = sizeOf(response);
            imageSizeCache[imageUrl] = result;
            return result;
        });
    }

    // select appropriate method to get image size
    function _getImageSize(imageUrl) {
        // use cached image size if we've already seen this url
        if (imageSizeCache[imageUrl]) {
            return Promise.resolve(imageSizeCache[imageUrl]);
        }

        // fetch full image for formats we can't probe
        const extensionMatch = imageUrl.match(/(?:\.)([a-zA-Z]{3,4})(\?|$)/) || [];
        const extension = (extensionMatch[1] || '').toLowerCase();
        if (FETCH_ONLY_FORMATS.includes(extension)) {
            return _fetchImageSize(imageUrl);
        }

        // probe partial image for everything else
        return _probeImageSize(imageUrl);
    }

    // convert <img> to <amp-img> or <amp-anim>, fetching dimensions of
    // external images. If anything fails leave the element as an <img>
    function amperizeImageElem(element) {
        return async function amperizeImage() {
            if (!element.attribs || !element.attribs.src) {
                return;
            }

            var src = url.parse(element.attribs.src).href;

            // when we have a gif it should be <amp-anim>.
            element.name = src.match(/(\.gif$)/) ? 'amp-anim' : 'amp-img';

            if (src.indexOf('http') === 0) {
                // external image, fetch real dimensions
                try {
                    if (!validator.isURL(src)) {
                        element.name = 'img';
                        return;
                    }

                    var dimensions = await _getImageSize(src);

                    // CASE: `.ico` files might have multiple images and therefore multiple sizes.
                    // We return the largest size found (image-size default is the first size found)
                    if (dimensions.images) {
                        dimensions.width = _.maxBy(dimensions.images, function (w) {return w.width;}).width;
                        dimensions.height = _.maxBy(dimensions.images, function (h) {return h.height;}).height;
                    }

                    if (!dimensions.width || !dimensions.height) {
                        element.name = 'img';
                        return;
                    }

                    element.attribs.width = dimensions.width;
                    element.attribs.height = dimensions.height;
                } catch (err) {
                    // deliberate best effort: an image we cannot measure stays an <img>
                    element.name = 'img';
                    return;
                }
            } else {
                // local image, use default fallback
                element.attribs.width = self.config[element.name].width;
                element.attribs.height = self.config[element.name].height;
            }

            if (!element.attribs.layout) {
                setLayoutAttribute(element);
            }
        };
    }

    // sequentially serialize `nodes`, appending to `markup`; recurses into children
    function walk(nodes, markup, finished) {
        async.reduce(nodes, markup, function reduce(html, element, step) {
            var children;

            // NOTE(review): the pattern is unanchored, so any element whose name merely
            // contains one of these words (e.g. <noscript>) is dropped as well - confirm
            // that this is intended.
            if (/(style|script|textarea|link)/.test(element.name)) {
                return step(null, html);
            }

            function close(error, html) {
                html += helpers.close(element);
                // hand an error from the nested traversal on instead of dropping it
                step(error, html);
            }

            function enter() {
                children = element.children;
                html += helpers[element.type](element);

                if (!children || !children.length) {
                    return close(null, html);
                }

                setImmediate(function delay() {
                    walk(children, html, close);
                });
            }

            if (element.name === 'iframe') {
                if (!element.attribs.src) {
                    return enter();
                }

                var youtubeId = element.attribs.src.match(/^.*(youtu.be\/|youtube(-nocookie)?.com\/(v\/|.*u\/\w\/|embed\/|.*v=))([\w-]{11}).*/);
                useSecureSchema(element);

                if (youtubeId) {
                    element.name = 'amp-youtube';
                    element.attribs['data-videoid'] = youtubeId[4];
                    delete element.attribs.src;
                    delete element.attribs.sandbox;
                    delete element.attribs.allowfullscreen;
                    delete element.attribs.allow;
                    delete element.attribs.frameborder;
                } else {
                    element.name = 'amp-iframe';
                    element.attribs.sandbox = !element.attribs.sandbox ? self.config['amp-iframe'].sandbox : element.attribs.sandbox;
                }

                // NOTE(review): width and height fall back to the 'amp-iframe' settings
                // even for <amp-youtube>; the 'amp-youtube' width/height are not read
                // here - confirm that this is intended.
                if (!element.attribs.width || !element.attribs.height || !element.attribs.layout) {
                    element.attribs.width = !element.attribs.width ? self.config['amp-iframe'].width : element.attribs.width;
                    element.attribs.height = !element.attribs.height ? self.config['amp-iframe'].height : element.attribs.height;
                    setLayoutAttribute(element);
                }
            }

            if (element.name === 'audio') {
                element.name = 'amp-audio';
                useSecureSchema(element);
            }

            // sources nested in an <amp-audio> need a secure src as well
            if (element.attribs && element.attribs.src && element.parent && element.parent.name === 'amp-audio') {
                useSecureSchema(element);
            }

            return enter();
        }, finished);
    }

    // convert all of the img elements first so that we can perform lengthy
    // network requests in parallel before sequentially traversing the DOM.
    // This runs once for the whole tree, not once per nesting level.
    if (self.config['amp-img']) {
        try {
            var imgElems = domutils.findAll(function imgTest(elem) {
                return elem.name === 'img' && elem.attribs.src;
            }, data);
            var imgTasks = imgElems.map(function (elem) {
                return amperizeImageElem(elem);
            });
            await async.parallelLimit(imgTasks, 10);
        } catch (err) {
            // report the failure to the caller instead of leaving `done` uncalled
            return done(err);
        }
    }

    // sequentially traverse the DOM
    walk(data, html, done);
};
334
// The Amperize constructor is the module's only export.
module.exports = Amperize;