UNPKG

19.4 kBPlain TextView Raw
1
2
3import * as async from 'async';
4import * as crypto from 'crypto';
5import * as fs from 'fs';
6import {emptyDir, ensureDir, rmdir, writeFile} from "fs-extra";
7import * as moment from 'moment';
8import * as OS from 'os';
9import * as request from 'request';
10import * as Debug from 'debug';
11import * as UrlParser from 'url';
12import * as mkdirp from 'mkdirp';
13
14const debug = Debug('bablic:seo');
15const zlib = require('zlib');
16
17import {
18 ExtendedRequest, ExtendedResponse, Middleware, getLink, KeywordMapper, SiteMeta, LastModifiedByLocale,
19 BablicLinkOptions
20} from "./common";
21import {OutgoingMessage, ServerResponse} from "http";
22import {Stats} from "fs";
23import {RequestResponse} from "request";
24import _ = require("lodash");
25
/**
 * Tunables for {@link SeoMiddleware}. Every field is optional.
 */
export interface SeoOptions {
    /** When falsy, cached translations are never looked up before rendering. */
    useCache?: boolean;
    /** Not read anywhere in this file. NOTE(review): presumably consumed by another module - confirm. */
    defaultCache?: string[];
    /** Base directory for cached pages; entries are stored as `<cacheDir>/<locale>/<md5(url)>`. */
    cacheDir?: string;
    /** Not read anywhere in this file. NOTE(review): purpose unknown from here - confirm. */
    test?: boolean;
    /** Host used instead of the request's Host header (always over http) when building the URL sent to the renderer. */
    altHost?: string;
    /** Number of days a cache file stays valid, counted from its mtime; the middleware falls back to 1 when unset. */
    cacheDays?: number;
}
34
/**
 * Sub-directory locale settings. They are copied into the link options handed to
 * `getLink` and are also forwarded to the render server as query parameters.
 */
export interface SeoSubDirOptions {
    /** When true, `ld=subdir` is appended to the render request. */
    subDir: boolean;
    /** When non-empty (and `subDir` is true), sent URL-encoded as the `sdb` parameter. */
    subDirBase: string;
    /** When true (and `subDir` is true), `sdo=true` is appended to the render request. */
    subDirOptional: boolean;
}
40
/**
 * Middleware factory that answers crawler requests with translated HTML.
 *
 * For requests that look like bots (or that target sitemap/robots URLs) the
 * middleware buffers the HTML produced by the downstream handlers, sends it to
 * the render server (`SEO_ROOT`) for translation, and flushes the translated
 * result instead. Translated pages are stored on disk under
 * `options.cacheDir` and served from there on later requests.
 */
export class SeoMiddleware{
    private subDirOptions: BablicLinkOptions;

    /**
     * @param siteId Site identifier sent to the render server as the `site` parameter.
     * @param options Cache / host settings.
     * @param subDirOptions Sub-directory settings; merged over a `{returnFull: true}` default.
     */
    constructor(private siteId: string, private options: SeoOptions, subDirOptions: SeoSubDirOptions){
        this.subDirOptions = Object.assign({returnFull: true}, subDirOptions);
    }

    /**
     * Stores a translated page in the cache file for `url` / `locale`.
     *
     * The write is attempted optimistically; if it fails (typically because the
     * locale directory does not exist yet) the directory is created and the
     * write is retried once. A failure of the retry rejects the promise.
     */
    async writeToCache(url: string, locale: string, translated: string): Promise<void> {
        const cachePath = fullPathFromUrl(url, locale, this.options.cacheDir);
        try {
            await writeFile(cachePath, translated);
        } catch (e) {
            const cacheDir = getCacheDir(locale, this.options.cacheDir);
            debug("create cache dir", cacheDir);
            await ensureDir(cacheDir);
            debug("created");
            await writeFile(cachePath, translated);
        }
    }

    /**
     * Asks the render server for the translated version of a page.
     *
     * POSTs `{html}` as JSON to `SEO_ROOT` with `site`, `el` (locale) and `url`
     * query parameters, plus the sub-directory parameters when enabled.
     * The request uses `encoding: null`, so the resolved body is the raw
     * response payload (NOTE(review): at runtime this is a Buffer, possibly
     * gzip-compressed, despite the declared `string` type).
     *
     * After resolving, the body is written to the cache in the background;
     * cache write errors are only logged.
     *
     * @returns Rejects when the renderer is flagged unhealthy, on transport
     *          errors, on non-2xx status codes and on an empty body.
     */
    getHtml(url: string, locale: string, html?: string): Promise<string> {
        if (!isRenderHealthy) {
            return Promise.reject(new Error("Render is not health"));
        }
        debug('getting from bablic', url, 'html:', !!html );
        let ld = '';
        if(this.subDirOptions.subDir) {
            ld = '&ld=subdir';
            if(this.subDirOptions.subDirBase)
                ld += '&sdb=' + encodeURIComponent(this.subDirOptions.subDirBase);
            if(this.subDirOptions.subDirOptional)
                ld += '&sdo=true';
        }
        return new Promise<string>((resolve, reject) => {
            request({
                url: SEO_ROOT + "?site=" + this.siteId + "&el=" + locale + "&url=" + (encodeURIComponent(url)) + ld,
                headers:{
                    "Accept-Encoding": "gzip,deflate"
                },
                method: 'POST',
                json: {
                    html: html
                },
                timeout: 20000,
                encoding:null,
            }, (error:any, response:RequestResponse, body: any) => {
                if (error)
                    return reject(error);

                if (response.statusCode < 200 || response.statusCode >= 300)
                    return reject(new Error("Status-" + response.statusCode));

                if (body == null)
                    return reject(new Error('empty response'));

                debug('received translated html', response.statusCode);
                resolve(body);

                // Fire-and-forget: a failed cache write must not affect the response.
                this.writeToCache(url, locale, body).catch((e) => {
                    debug("error writing to cache", e);
                });
            });
        });
    }

    /**
     * Looks up the cached translation for `url` / `locale`.
     *
     * @param skip When true (or when `options.useCache` is falsy) the callback
     *             is invoked immediately with no arguments.
     * @param callback Receives `(error)` when the file cannot be stat'ed or
     *                 read, otherwise `(null, data, isValid)` where `isValid`
     *                 tells whether the file is younger than `cacheDays`
     *                 (default 1).
     */
    getFromCache(url: string, locale: string, skip: boolean, callback:(e?:Error, html?: Buffer | string, isValid?: boolean) => void) {
        if (!this.options.useCache || skip)
            return callback();

        const file_path = fullPathFromUrl(url, locale, this.options.cacheDir);
        fs.stat(file_path, (error:NodeJS.ErrnoException, file_stats: Stats) => {
            if (error)
                return callback(error);

            fs.readFile(file_path, (error:NodeJS.ErrnoException, data: Buffer) => {
                if (error)
                    return callback(error);

                callback(error, data, cacheValid(file_stats, this.options.cacheDays || 1));
            });
        });
    }

    /**
     * Tells whether `buffer` starts with the gzip magic bytes `0x1f 0x8b`.
     * Returns false when the value cannot be indexed (e.g. null/undefined).
     */
    isEncoded(buffer) {
        try {
            // every gzip content start with 0x1f8b 2 bytes
            let firstByte = buffer[0];
            let secondByte = buffer[1];
            return (firstByte == 0x1f) && (secondByte == 0x8b)
        } catch (err) {
            return false;
        }
    }

    /**
     * Reads a response header and normalizes it to a string: missing/falsy
     * values become `""`, arrays contribute their first element, and any
     * other non-string value is stringified.
     */
    readHeaderAsString(res: ExtendedResponse, headerName: string): string {
        let value = res.getHeader(headerName);
        if (!value)
            return "";
        if (Array.isArray(value)) {
            value = value[0];
        }
        if (typeof(value) !== "string") {
            return value + "";
        } else {
            return value;
        }
    }

    /**
     * Deletes everything stored under `options.cacheDir`.
     *
     * Uses `emptyDir` rather than `rmdir`: a plain `rmdir` fails with
     * ENOTEMPTY as soon as the cache holds any locale folder. The (now empty)
     * cache directory itself is kept.
     */
    async purgeCache(): Promise<void> {
        debug("purge cache", this.options.cacheDir);
        await emptyDir(this.options.cacheDir);
        debug("purge done");
    }

    /**
     * Builds the request handler.
     *
     * Flow of the returned handler:
     *  1. Requests that are neither bot page views nor sitemap/robots URLs are passed to `next()`.
     *  2. A cached translation, when present, is flushed immediately. If it is
     *     still valid the request ends there; if it is stale the downstream
     *     handlers still run ("cache only" mode) so the cache gets refreshed,
     *     but nothing more is sent to the client.
     *  3. Otherwise `res.write` / `res.end` / `res.writeHead` are temporarily
     *     replaced so the downstream HTML is buffered. On `end` the buffered
     *     markup is either URL-rewritten locally (sitemap/robots) or sent to
     *     the render server, and the result is flushed through the restored
     *     original methods. Non-HTML and non-2xx responses are passed through untouched.
     */
    middleware(){
        return (meta:SiteMeta,lastModified:LastModifiedByLocale, keywordsByLocale: KeywordMapper, reverseKeywordByLocale: KeywordMapper, req: ExtendedRequest, res: ExtendedResponse, next: () => void) => {

            let replaceUrls = shouldReplaceUrls(req);
            if (!shouldHandle(req) && !replaceUrls) {
                debug('ignored', req.url);
                return next();
            }

            let acceptGZIP = (req.headers['accept-encoding'] || '').indexOf('gzip') > -1;

            // Downstream handlers must produce plain text so it can be buffered and translated.
            delete req.headers['accept-encoding'];
            req.bablic.proxied = true;

            let protocol = req.headers['x-forwarded-proto'] || 'http';
            let my_url = protocol + "://" + req.headers.host + req.originalUrl;
            if (this.options.altHost)
                my_url = "http://" + this.options.altHost + req.originalUrl;

            this.getFromCache(my_url, req.bablic.locale, replaceUrls, (e, html, isValid) => {
                let cache_only = false;
                if (html) {
                    const encoded = this.isEncoded(html);
                    let cacheUsable = true;
                    // Client cannot take gzip: inflate the cached payload first.
                    if (encoded && !acceptGZIP) {
                        try {
                            html = zlib.gunzipSync(html);
                        } catch (err) {
                            // A truncated/corrupt entry must not crash the process;
                            // treat it as a cache miss so it gets re-rendered and overwritten.
                            debug('corrupted cache entry, ignoring', err);
                            cacheUsable = false;
                        }
                    }

                    if (cacheUsable) {
                        debug('flushing from cache');
                        res.setHeader('Content-Type', 'text/html; charset=utf-8');
                        res.setHeader('Content-Language', req.bablic.locale);
                        if (encoded && acceptGZIP) {
                            res.setHeader('Content-Encoding', 'gzip');
                        }

                        res.write(html);
                        res.end();

                        if (isValid)
                            return;
                        // Stale entry: keep going only to refresh the cache.
                        cache_only = true;
                    }
                }

                if (!isRenderHealthy && !replaceUrls) {
                    debug('render not healthy, skipping');
                    return next();
                }

                debug('overriding response');
                let justAnObject: any = <any>res;
                let _end = res.end;
                let _write = res.write;
                let _writeHead = res.writeHead;

                // Record status/headers without flushing them; supports both
                // writeHead(status, headers) and writeHead(status, reason, headers).
                justAnObject.writeHead = (status: any, reason?: any, _headers?: any) => {
                    if (typeof reason !== 'string') {
                        _headers = reason;
                        reason = undefined;
                    }
                    res.statusCode = status;
                    if (reason)
                        res.statusMessage = reason;
                    if (_headers && typeof _headers === 'object') {
                        Object.keys(_headers).forEach((key) => res.setHeader(key, _headers[key]));
                    }
                    return res;
                };

                let headers = {};
                let _getHeader;
                if (cache_only) {
                    // The real response is already finished; fake a fresh one so
                    // downstream handlers run without "headers already sent" errors.
                    _getHeader = res.getHeader;

                    res.finished = false;
                    Object.defineProperty(res,"headersSent",{
                        get:()=>{
                            return false;
                        },
                        configurable:true,
                        enumerable:true,
                    });

                    res.setHeader = (name, value) => headers[name.toLowerCase().trim()] = value;
                    res.removeHeader = name => headers[name.toLowerCase().trim()] = null;
                    res.getHeader = name => {
                        let local = headers[name.toLowerCase().trim()];
                        if (local)
                            return local;
                        if (local === null)
                            return;
                        return _getHeader.call(res, name);
                    };
                }

                // Puts the original write/end/writeHead back; safe to call more than once.
                let restore_override = () => {
                    if (!_write || !_end || !_writeHead)
                        return;
                    debug('undo override');
                    res.write = _write;
                    res.end = _end;
                    res.writeHead = _writeHead;

                    if (cache_only) {
                        _getHeader = null;
                        const getter = Object.getOwnPropertyDescriptor(OutgoingMessage.prototype,"headersSent");
                        Object.defineProperty(res,"headersSent",getter );
                    }

                    _write = _end = _writeHead = null;
                };

                let head_checked = false;
                let is_html = null;
                // Raw bytes are collected and decoded once, so multi-byte
                // characters split across chunks are not corrupted.
                let chunks: Buffer[] = [];

                // Decides (once) whether this response should be intercepted at all.
                let check_head = () => {
                    if (head_checked)
                        return;

                    const ct = this.readHeaderAsString(res, 'content-type');
                    is_html = ct.indexOf('text/html') > -1 || replaceUrls;

                    if (!is_html) {
                        debug('not html', ct);
                        restore_override();
                    }
                    if (res.statusCode < 200 || res.statusCode >= 300) {
                        debug('error response', res.statusCode);
                        is_html = false;
                        restore_override();
                    }
                    head_checked = true;
                };

                res.write = function(chunk?: any, encoding?: any, cb?: any) {
                    // write(chunk, cb) form: the second argument is the callback, not an encoding.
                    const callback = typeof encoding === 'function' ? encoding : cb;
                    const enc = typeof encoding === 'function' ? undefined : encoding;

                    check_head();
                    if (!is_html) {
                        if (cache_only)
                            return true;

                        debug('write original');
                        // check_head() restored the original method, so this is the real write.
                        return res.write.apply(res, arguments);
                    }

                    if (chunk != null) {
                        if (Buffer.isBuffer(chunk)) {
                            const isUtf8 = !enc || /^utf-?8$/i.test(enc);
                            chunks.push(isUtf8 ? chunk : Buffer.from(chunk.toString(enc)));
                        } else if (chunk instanceof Uint8Array) {
                            chunks.push(Buffer.from(chunk));
                        } else {
                            chunks.push(Buffer.from(String(chunk)));
                        }
                    }
                    if (typeof callback === 'function')
                        callback();
                    return true;
                };

                const self = this;
                let alt_host = this.options.altHost;
                justAnObject.end = function(chunk?: any, encoding?: any, cb?: any) {
                    // Normalize the end(cb) / end(chunk, cb) / end(chunk, encoding, cb) forms.
                    let body = chunk;
                    let enc = encoding;
                    let callback = cb;
                    if (typeof body === 'function') {
                        callback = body;
                        body = undefined;
                        enc = undefined;
                    } else if (typeof enc === 'function') {
                        callback = enc;
                        enc = undefined;
                    }

                    check_head();
                    if (!is_html) {
                        if (cache_only)
                            return;
                        debug('flush original');
                        restore_override();
                        return res.end.apply(res, arguments);
                    }

                    // Buffer the final chunk WITHOUT the callback: the callback is
                    // handed to the real write below and must fire only once.
                    if (body != null)
                        res.write(body, enc);

                    let original_html = Buffer.concat(chunks).toString();
                    res.setHeader('Content-Language', req.bablic.locale);
                    if (replaceUrls) {
                        restore_override();

                        // Sitemap XML: bump <lastmod> dates that are older than the
                        // last translation change for this locale.
                        if (lastModified && lastModified[req.bablic.locale] && /sitemap/i.test(req.url) &&
                            self.readHeaderAsString(res, 'content-type').indexOf('xml') > -1){

                            const bablicDate = new Date(lastModified[req.bablic.locale]);
                            original_html = original_html.replace(new RegExp("<lastmod>(.*?)</lastmod>", "g"), (captureAll, dateCapture) => {
                                let siteMapDate = new Date(dateCapture);
                                if (siteMapDate < bablicDate) {
                                    return "<lastmod>" + bablicDate.toISOString() + "</lastmod>";
                                } else {
                                    return captureAll;
                                }
                            });
                        }

                        const locale = req.bablic.locale;
                        const currentHost = req.headers.host as string;
                        let originalDomains: string[] = [currentHost];
                        if(alt_host)
                            originalDomains.push(alt_host);
                        if (meta.localeDetection === "custom" && meta.customUrls && meta.customUrls[locale]) {
                            if(currentHost === meta.customUrls[locale]) {
                                let supposeOriginDomain = meta.customUrls[meta.original];
                                if (supposeOriginDomain) {
                                    originalDomains.push(supposeOriginDomain);
                                }
                            }
                        }

                        const rewritten = original_html.replace(detect_url, url => {
                            if (ignore_not_html_or_xml.test(url))
                                return url;
                            // Only URLs that point at this site are localized.
                            if (_.every(originalDomains, (domain) => !url.includes(domain))) {
                                return url;
                            }

                            let parsed = UrlParser.parse(url);
                            // translate URLs in sitemaps and such
                            let keywords = keywordsByLocale && keywordsByLocale[req.bablic.locale];
                            if (keywords && parsed.pathname) {
                                parsed.pathname = parsed.pathname.split('/').map(part => keywords[part] || part).join('/');
                            }
                            return getLink(req.bablic.locale, parsed, meta, self.subDirOptions);
                        });
                        if (res.getHeader('Transfer-Encoding') !== 'chunked') {
                            res.setHeader('Content-Length', Buffer.byteLength(rewritten));
                        }
                        res.write(rewritten, callback);
                        return res.end();
                    }

                    self.getHtml(my_url, req.bablic.locale, original_html).then((data) => {
                        // Stale-cache refresh: the client already got its answer.
                        if (cache_only)
                            return;

                        let payload: any = data;
                        let gzipped = self.isEncoded(payload);
                        if (gzipped && !acceptGZIP) {
                            try {
                                payload = zlib.gunzipSync(payload);
                            } catch (err) {
                                // Unreadable renderer payload: fall back to the untranslated page
                                // instead of leaving the response hanging.
                                debug('failed to inflate translated html', err);
                                payload = original_html;
                            }
                            gzipped = false;
                        }
                        if (gzipped) {
                            res.setHeader('Content-Encoding', 'gzip');
                        }

                        restore_override();
                        debug('flushing translated');
                        if (res.getHeader('Transfer-Encoding') !== 'chunked') {
                            res.setHeader('Content-Length', Buffer.byteLength(payload));
                        }
                        res.write(payload, callback);
                        res.end();
                    }, (error) => {
                        if (cache_only)
                            return;
                        restore_override();
                        console.error('[Bablic SDK] Error:', my_url, error);
                        debug('flushing original');
                        res.write(original_html, callback);
                        res.end();
                    });
                };
                return next();
            });
        };
    }
}
418
419
/**
 * Matches URLs that point at static assets (scripts, styles, media, fonts,
 * documents) and therefore must not be rewritten to a localized link.
 *
 * The extension has to be the end of the URL or be followed by a query string
 * / fragment. Without that anchor `.js` also matched host names such as
 * `www.jsworld.com` or pages such as `index.jsp`, which were then never localized.
 */
const ignore_not_html_or_xml = /\.(js|css|jpg|jpeg|png|ico|mp4|wmv|ogg|mp3|avi|mpeg|bmp|wav|pdf|doc|docx|xlsx|xls|json|kml|svg|eot|woff|woff2)(?=[?#]|$)/i;
421
// Finds absolute http/https/ftp/file URLs inside a text body (case-insensitive,
// global). The final character class excludes trailing punctuation such as
// `.`, `,`, `;`, `:`, `!` and `?`, so a URL at the end of a sentence is matched
// without that punctuation. Used with String.replace to localize links.
const detect_url = /(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;
423
// Endpoint of the render (translation) server; replaceable via setRenderServer().
let SEO_ROOT = 'http://seo.bablic.com/api/engine/seo';

/**
 * Points the module at a different render server.
 *
 * @param url New endpoint; must be non-empty.
 * @throws Error when `url` is empty/falsy.
 */
export function setRenderServer(url: string) {
    if (url) {
        SEO_ROOT = url;
        return;
    }
    throw new Error("Must be a valid URL");
}
432
/** Returns the hex-encoded MD5 digest of `data` (used to derive cache file names). */
function hash(data){
    const md5 = crypto.createHash('md5');
    md5.update(data);
    return md5.digest('hex');
}
/** Builds the cache file path for a page: `<cacheDir>/<locale>/<md5 of url>`. */
function fullPathFromUrl(url: string, locale: string, cacheDir: string) {
    const fileName = hash(url);
    return `${cacheDir}/${locale}/${fileName}`;
}
/** Returns the per-locale cache folder: `<cacheDir>/<locale>`. */
function getCacheDir(locale: string, cacheDir: string) {
    return `${cacheDir}/${locale}`;
}
/**
 * Tells whether a cache file is still fresh.
 *
 * @param file_stats Stats of the cache file; only `mtime` is read.
 * @param cacheDays Lifetime of an entry, in days, counted from `mtime`.
 * @returns true while the current time is before `mtime + cacheDays`.
 */
function cacheValid(file_stats: Stats, cacheDays: number) {
    const expiresAt = moment(file_stats.mtime.getTime()).add(cacheDays, 'days');
    return expiresAt.isAfter(moment());
}
448
// Static-asset extensions that never need translation. The extension must end
// the URL or be followed by a query string / fragment; the previous unanchored
// pattern also matched pages such as `/index.jsp` or `/blog/node.js-tips`,
// which were then wrongly skipped. The list also covers ico/mp4/wmv/ogg so it
// agrees with the asset list used for URL rewriting in this file.
const filename_tester = /\.(js|css|jpg|jpeg|png|ico|mp4|wmv|ogg|mp3|avi|mpeg|bmp|wav|pdf|doc|xml|docx|xlsx|xls|json|kml|svg|eot|woff|woff2)(?=[?#]|$)/i;

/** Returns true when the request URL targets a static asset (by file extension). */
function ignorable(req) {
    return filename_tester.test(req.url);
}
// Substrings (case-insensitive) that mark a User-Agent as a crawler / social scraper.
const google_tester = /bot|crawler|yandex|bing|baidu|spider|facebook|twitter|80legs|google|seo/i;

/** Returns true when the request's User-Agent header looks like a crawler. */
function isBot(req) {
    const userAgent = req.headers['user-agent'];
    return google_tester.test(userAgent);
}
457
/** A request is translated only when it comes from a crawler and does not target a static asset. */
function shouldHandle(req) {
    if (!isBot(req)) {
        return false;
    }
    return !ignorable(req);
}
461
/** Returns true for sitemap / robots URLs, whose links are rewritten locally instead of rendered. */
function shouldReplaceUrls(req) {
    const sitemapOrRobots = /sitemap|robots/i;
    return sitemapOrRobots.test(req.url);
}
465
466
/**
 * Probes the render server with a GET to `SEO_ROOT` (10 s timeout).
 *
 * Never rejects: resolves `false` when the request itself fails (transport
 * error / timeout) and `true` otherwise. The HTTP status code is not inspected.
 */
function renderHealthCheck(): Promise<boolean> {
    return new Promise<boolean>((resolve) => {
        debug('render health check');
        const probe = {
            url: SEO_ROOT,
            headers: {
                "Accept-Encoding": "gzip,deflate"
            },
            method: 'GET',
            timeout: 10000,
        };
        request(probe, (error: any) => {
            const healthy = !error;
            if (healthy) {
                debug('render is healthy');
            } else {
                debug('render is not healthy', error);
            }
            resolve(healthy);
        });
    });
}
487
// Last known state of the render server; optimistic until the first probe completes.
let isRenderHealthy = true;

// Re-probe the render server once a minute.
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- the timer handle type differs between Node and DOM typings
const renderHealthTimer: any = setInterval(() => {
    renderHealthCheck().then((health) => {
        isRenderHealthy = health;
    });
}, 1000*60);

// A background health probe must not keep the process alive on its own:
// without unref() merely importing this module prevents scripts and test
// runs from ever exiting.
if (renderHealthTimer && typeof renderHealthTimer.unref === 'function') {
    renderHealthTimer.unref();
}
495
\No newline at end of file