1 |
|
2 |
|
3 | import * as async from 'async';
|
4 | import * as crypto from 'crypto';
|
5 | import * as fs from 'fs';
|
6 | import {emptyDir, ensureDir, rmdir, writeFile} from "fs-extra";
|
7 | import * as moment from 'moment';
|
8 | import * as OS from 'os';
|
9 | import * as request from 'request';
|
10 | import * as Debug from 'debug';
|
11 | import * as UrlParser from 'url';
|
12 | import * as mkdirp from 'mkdirp';
|
13 |
|
14 | const debug = Debug('bablic:seo');
|
15 | const zlib = require('zlib');
|
16 |
|
17 | import {
|
18 | ExtendedRequest, ExtendedResponse, Middleware, getLink, KeywordMapper, SiteMeta, LastModifiedByLocale,
|
19 | BablicLinkOptions
|
20 | } from "./common";
|
21 | import {OutgoingMessage, ServerResponse} from "http";
|
22 | import {Stats} from "fs";
|
23 | import {RequestResponse} from "request";
|
24 | import _ = require("lodash");
|
25 |
|
/**
 * Settings that control caching and URL construction in {@link SeoMiddleware}.
 */
export interface SeoOptions {
    /** When falsy, cached pages are never read back from disk. */
    useCache?: boolean;
    /** Not referenced in this module; presumably a list of URLs to pre-cache (TODO confirm). */
    defaultCache?: string[];
    /** Base directory of the cache; entries live at `<cacheDir>/<locale>/<md5 of url>`. */
    cacheDir?: string;
    /** Not referenced in this module; purpose to be confirmed against callers. */
    test?: boolean;
    /** Host used instead of the request's `Host` header (always with `http://`) when building the page URL. */
    altHost?: string;
    /** Number of days a cached entry stays fresh; a falsy value is treated as 1. */
    cacheDays?: number;
}
|
34 |
|
/**
 * Sub-directory locale settings. They are merged into the link options handed
 * to `getLink` and forwarded to the render server as query parameters.
 */
export interface SeoSubDirOptions {
    /** When true, `&ld=subdir` is appended to render requests. */
    subDir: boolean;
    /** When non-empty (and `subDir` is set), sent URL-encoded as `&sdb=<value>`. */
    subDirBase: string;
    /** When true (and `subDir` is set), `&sdo=true` is appended to render requests. */
    subDirOptional: boolean;
}
|
40 |
|
/**
 * Middleware factory that serves translated HTML (fetched from the render
 * server and cached on disk) to bot requests, and rewrites absolute URLs in
 * sitemap/robots responses.
 *
 * The middleware buffers the downstream handler's output by temporarily
 * overriding `write`, `end` and `writeHead` on the response object.
 */
export class SeoMiddleware{
    private subDirOptions: BablicLinkOptions;

    /**
     * @param siteId        Site identifier sent to the render server as `site=`.
     * @param options       Cache / host settings.
     * @param subDirOptions Sub-directory settings; merged over `{returnFull: true}`.
     */
    constructor(private siteId: string, private options: SeoOptions, subDirOptions: SeoSubDirOptions){
        this.subDirOptions = Object.assign({returnFull: true}, subDirOptions);
    }

    /**
     * Stores a rendered page in the cache.
     *
     * The write is attempted first; only if it fails is the locale directory
     * created and the write retried. A failure of the retry rejects the promise.
     */
    async writeToCache(url: string, locale: string, translated: string): Promise<void> {
        let cachePath = fullPathFromUrl(url, locale, this.options.cacheDir);
        try {
            await writeFile(cachePath, translated);
        } catch (e) {
            const cacheDir = getCacheDir(locale, this.options.cacheDir);
            debug("create cache dir", cacheDir);
            await ensureDir(cacheDir);
            debug("created");
            await writeFile(cachePath, translated);
        }
    }

    /**
     * Requests the translated version of a page from the render server.
     *
     * Rejects immediately while the render server is flagged unhealthy, on a
     * transport error, on a non-2xx status, or on an empty body. On success the
     * body is resolved first and then written to the cache in the background
     * (cache write errors are only logged).
     *
     * @param url    Absolute URL of the page.
     * @param locale Target locale, sent as `el=`.
     * @param html   Original HTML, posted as the JSON field `html`.
     */
    getHtml(url: string, locale: string, html?: string): Promise<string> {
        if (!isRenderHealthy) {
            return Promise.reject(new Error("Render is not health"));
        }
        debug('getting from bablic', url, 'html:', !!html );
        let ld = '';
        if(this.subDirOptions.subDir) {
            ld = '&ld=subdir';
            if(this.subDirOptions.subDirBase)
                ld += '&sdb=' + encodeURIComponent(this.subDirOptions.subDirBase);
            if(this.subDirOptions.subDirOptional)
                ld += '&sdo=true';
        }
        return new Promise<string>((resolve, reject) => {
            request({
                url: SEO_ROOT + "?site=" + this.siteId + "&el=" + locale + "&url=" + (encodeURIComponent(url)) + ld,
                headers:{
                    "Accept-Encoding": "gzip,deflate"
                },
                method: 'POST',
                json: {
                    html: html
                },
                timeout: 20000,
                // null encoding: the body is delivered as a raw Buffer (possibly still gzipped)
                encoding:null,
            }, (error:any, response:RequestResponse, body: any) => {
                if (error)
                    return reject(error);

                if (response.statusCode < 200 || response.statusCode >= 300)
                    return reject(new Error("Status-" + response.statusCode));

                if (body == null)
                    return reject(new Error('empty response'));

                debug('received translated html', response.statusCode);
                resolve(body);

                this.writeToCache(url, locale, body).catch((e) => {
                    debug("error writing to cache", e);
                });
            });
        });
    }

    /**
     * Looks a page up in the on-disk cache.
     *
     * Calls back with no arguments when caching is disabled or `skip` is set,
     * with an error when the entry cannot be stat'ed or read, and otherwise
     * with the file contents plus a flag telling whether the entry is still
     * within `cacheDays` (default 1).
     */
    getFromCache(url: string, locale: string, skip: boolean, callback:(e?:Error, html?: Buffer | string, isValid?: boolean) => void) {
        if (!this.options.useCache || skip)
            return callback();

        let file_path = fullPathFromUrl(url, locale, this.options.cacheDir);
        fs.stat(file_path, (error:NodeJS.ErrnoException, file_stats: Stats) => {
            if (error)
                return callback(error);

            fs.readFile(file_path, (error:NodeJS.ErrnoException, data: Buffer) => {
                if (error)
                    return callback(error);

                callback(error, data, cacheValid(file_stats, this.options.cacheDays || 1));
            });
        });
    }

    /**
     * Reports whether the payload starts with the two gzip magic bytes
     * (0x1f 0x8b). Any error while indexing (e.g. a null payload) yields false.
     */
    isEncoded(buffer) {
        try {
            // every gzip content start with 0x1f8b 2 bytes
            let firstByte = buffer[0];
            let secondByte = buffer[1];
            return (firstByte == 0x1f) && (secondByte == 0x8b)
        } catch (err) {
            return false;
        }
    }

    /**
     * Reads a response header as a string: empty string when missing/falsy,
     * first element when the header is an array, stringified otherwise.
     */
    readHeaderAsString(res: ExtendedResponse, headerName: string): string {
        let value = res.getHeader(headerName);
        if (!value)
            return "";
        if (Array.isArray(value)) {
            value = value[0];
        }
        if (typeof(value) !== "string") {
            return value + "";
        } else {
            return value;
        }
    }

    /**
     * Removes every cached page.
     *
     * Uses `emptyDir` rather than `rmdir`: `rmdir` only removes empty
     * directories and therefore failed as soon as anything had been cached.
     * The cache directory itself is kept (and created if missing).
     */
    async purgeCache(): Promise<void> {
        debug("purge cache", this.options.cacheDir);
        await emptyDir(this.options.cacheDir);
        debug("purge done");
    }

    /**
     * Builds the request handler.
     *
     * Flow for a handled request (bot request for a non-asset URL, or any
     * sitemap/robots URL):
     *  1. Try the cache. A hit is flushed immediately; if it is still valid the
     *     request is done, otherwise the handler continues in "cache only" mode
     *     to refresh the entry without touching the already-sent response.
     *  2. Override `write`/`end`/`writeHead` to buffer the downstream output.
     *  3. On `end`, either rewrite URLs (sitemap/robots) or send the buffered
     *     HTML to the render server and flush the translated result, falling
     *     back to the original HTML on error.
     */
    middleware(){
        return (meta:SiteMeta,lastModified:LastModifiedByLocale, keywordsByLocale: KeywordMapper, reverseKeywordByLocale: KeywordMapper, req: ExtendedRequest, res: ExtendedResponse, next: () => void) => {

            let replaceUrls = shouldReplaceUrls(req);
            if (!shouldHandle(req) && !replaceUrls) {
                debug('ignored', req.url);
                return next();
            }

            let acceptGZIP = (req.headers['accept-encoding'] || '').indexOf('gzip') > -1;

            // downstream handlers must produce uncompressed output so it can be buffered as text
            delete req.headers['accept-encoding'];
            req.bablic.proxied = true;

            let protocol = req.headers['x-forwarded-proto'] || 'http';
            let my_url = protocol + "://" + req.headers.host + req.originalUrl;
            if (this.options.altHost)
                my_url = "http://" + this.options.altHost + req.originalUrl;

            this.getFromCache(my_url, req.bablic.locale, replaceUrls, (e, html, isValid) => {
                let cache_only = false;
                if (html) {
                    debug('flushing from cache');
                    res.setHeader('Content-Type', 'text/html; charset=utf-8');
                    res.setHeader('Content-Language', req.bablic.locale);

                    const encoded = this.isEncoded(html);

                    if (acceptGZIP) {
                        if (encoded) {
                            res.setHeader('Content-Encoding', 'gzip');
                        }
                    }else{
                        if (encoded) {
                            html = zlib.gunzipSync(html);
                        }
                    }

                    res.write(html);
                    res.end();

                    if (isValid)
                        return;
                    // stale entry: response is already sent, keep going only to refresh the cache
                    cache_only = true;
                }

                if (!isRenderHealthy && !replaceUrls) {
                    debug('render not healthy, skipping');
                    return next();
                }

                debug('overriding response');
                let _end = res.end;
                let _write = res.write;
                let _writeHead = res.writeHead;

                // Record status/headers without sending them. Supports both
                // writeHead(status, headers) and writeHead(status, statusMessage, headers),
                // and returns the response so calls can be chained.
                res.writeHead = (status: number, reasonOrHeaders?: any, maybeHeaders?: any): any => {
                    res.statusCode = status;
                    let _headers = reasonOrHeaders;
                    if (typeof reasonOrHeaders === 'string') {
                        res.statusMessage = reasonOrHeaders;
                        _headers = maybeHeaders;
                    }
                    if (_headers && typeof _headers === 'object') {
                        for (let key in _headers)
                            res.setHeader(key, _headers[key]);
                    }
                    return res;
                };
                let headers = {};
                let _getHeader;
                if (cache_only) {
                    _getHeader = res.getHeader;

                    // The cached copy was already sent; pretend nothing was sent so the
                    // downstream handler can run, and keep its headers in a local map.
                    res.finished = false;
                    Object.defineProperty(res,"headersSent",{
                        get:()=>{
                            return false;
                        },
                        configurable:true,
                        enumerable:true,
                    });

                    res.setHeader = (name, value) => headers[name.toLowerCase().trim()] = value;
                    res.removeHeader = name => headers[name.toLowerCase().trim()] = null;
                    res.getHeader = name => {
                        let local = headers[name.toLowerCase().trim()];
                        if (local)
                            return local;
                        // null marks a header that was explicitly removed
                        if (local === null)
                            return;
                        return _getHeader.call(res, name);
                    };
                }
                let restore_override = () => {
                    if (!_write || !_end || !_writeHead)
                        return;
                    debug('undo override');
                    res.write = _write;
                    res.end = _end;
                    res.writeHead = _writeHead;

                    if (cache_only) {
                        _getHeader = null;
                        const getter = Object.getOwnPropertyDescriptor(OutgoingMessage.prototype,"headersSent");
                        Object.defineProperty(res,"headersSent",getter );
                    }

                    // nulling the saved references makes a second restore a no-op
                    _write = _end = _writeHead = null;
                };

                let head_checked = false;
                let is_html = null;
                let chunks = [];
                // Decides once, at first write/end, whether this response should be intercepted.
                let check_head = () => {
                    if (head_checked)
                        return;

                    const ct = this.readHeaderAsString(res, 'content-type');
                    is_html = ct.indexOf('text/html') > -1 || replaceUrls;

                    if (!is_html) {
                        debug('not html', ct);
                        restore_override();
                    }
                    if (res.statusCode < 200 || res.statusCode >= 300) {
                        debug('error response', res.statusCode);
                        is_html = false;
                        restore_override();
                    }
                    head_checked = true;
                };

                let justAnObject: any = <any>res;
                res.write = function(chunk?: any, encoding?: any, cb?: any) {
                    check_head();
                    if (!is_html) {
                        if (cache_only)
                            return;

                        debug('write original');
                        // check_head() restored the original write, so this is not recursive
                        return res.write.apply(res, arguments);
                    }
                    // write(chunk, cb): sort out the callback BEFORE decoding, otherwise
                    // Buffer.toString() receives a function as encoding and throws.
                    if(typeof(encoding) == 'function'){
                        cb = <Function>encoding;
                        encoding = void(0);
                    }
                    if (chunk instanceof Buffer)
                        chunk = (<Buffer>chunk).toString(encoding);
                    chunks.push(<string>chunk);
                    if(cb)
                        cb();
                    return true;
                };

                const self = this;
                let alt_host = this.options.altHost;
                justAnObject.end = function(chunk?: any, encoding?: any, cb?: any) {
                    // Normalise the end(cb) and end(chunk, cb) call forms.
                    if(typeof(chunk) == 'function'){
                        cb = <Function>chunk;
                        chunk = null;
                        encoding = void(0);
                    } else if(typeof(encoding) == 'function'){
                        cb = <Function>encoding;
                        encoding = void(0);
                    }

                    check_head();
                    if (!is_html) {
                        if (cache_only)
                            return;
                        debug('flush original');
                        restore_override();
                        return res.end.apply(res, arguments);
                    }

                    // Buffer the final chunk without the callback: the callback is
                    // handed to the real flush below so it runs exactly once.
                    if (chunk != null)
                        res.write(chunk, encoding);

                    let original_html = chunks.join('');
                    res.setHeader('Content-Language', req.bablic.locale);
                    if (replaceUrls) {
                        restore_override();

                        // In XML sitemaps, bump <lastmod> dates older than the locale's last modification.
                        if (lastModified && lastModified[req.bablic.locale] && /sitemap/i.test(req.url) &&
                            self.readHeaderAsString(res, 'content-type').indexOf('xml') > -1){

                            const bablicDate = new Date(lastModified[req.bablic.locale]);
                            original_html = original_html.replace(new RegExp("<lastmod>(.*?)</lastmod>", "g"), (captureAll, dateCapture) => {
                                let siteMapDate = new Date(dateCapture);
                                if (siteMapDate < bablicDate) {
                                    return "<lastmod>" + bablicDate.toISOString() + "</lastmod>";
                                } else {
                                    return captureAll;
                                }
                            });
                        }

                        const locale = req.bablic.locale;
                        const currentHost = req.headers.host as string;
                        let originalDomains: string[] = [currentHost];
                        if(alt_host)
                            originalDomains.push(alt_host);
                        if (meta.localeDetection === "custom" && meta.customUrls && meta.customUrls[locale]) {
                            if(currentHost === meta.customUrls[locale]) {
                                let supposeOriginDomain = meta.customUrls[meta.original];
                                if (supposeOriginDomain) {
                                    originalDomains.push(supposeOriginDomain);
                                }
                            }
                        }

                        html = original_html.replace(detect_url, url => {
                            if (ignore_not_html_or_xml.test(url))
                                return url;
                            // leave URLs that mention none of the known domains untouched
                            if (_.every(originalDomains, (domain) => !url.includes(domain))) {
                                return url;
                            }

                            let parsed = UrlParser.parse(url);

                            if(keywordsByLocale && keywordsByLocale[req.bablic.locale]){
                                let keywords = keywordsByLocale[req.bablic.locale];
                                parsed.pathname = parsed.pathname.split('/').map(part => keywords[part] || part).join('/');
                            }
                            return getLink(req.bablic.locale, parsed, meta, self.subDirOptions);
                        });
                        if (res.getHeader('Transfer-Encoding') !== 'chunked') {
                            res.setHeader('Content-Length', Buffer.byteLength(html));
                        }
                        res.write(html, cb);
                        return res.end();
                    }

                    self.getHtml(my_url, req.bablic.locale, original_html).then((data) => {
                        if (cache_only) {
                            // response was already served from cache; getHtml refreshed the entry
                            if (cb)
                                cb();
                            return;
                        }

                        const isEncoded = self.isEncoded(data);

                        if (!acceptGZIP) {
                            if (isEncoded) {
                                data = zlib.gunzipSync(data);
                            }
                        }else if (isEncoded) {
                            res.setHeader('Content-Encoding', 'gzip');
                        }

                        restore_override();
                        debug('flushing translated');
                        if (res.getHeader('Transfer-Encoding') !== 'chunked') {
                            res.setHeader('Content-Length', Buffer.byteLength(data));
                        }
                        res.write(data, cb);
                        res.end();
                    }, (error) => {
                        if (cache_only) {
                            if (cb)
                                cb();
                            return;
                        }
                        restore_override();
                        console.error('[Bablic SDK] Error:', my_url, error);
                        debug('flushing original');
                        res.write(original_html, cb);
                        res.end();
                    });
                };
                return next();
            });
        };
    }
}
|
418 |
|
419 |
|
/**
 * Matches URLs that contain a static-asset style extension; such URLs are
 * left untouched when links are rewritten. The pattern is unanchored, so the
 * extension may appear anywhere after a dot.
 */
const ignore_not_html_or_xml = /\.(js|css|jpg|jpeg|png|ico|mp4|wmv|ogg|mp3|avi|mpeg|bmp|wav|pdf|doc|docx|xlsx|xls|json|kml|svg|eot|woff|woff2)/i;

/**
 * Finds absolute http(s)/ftp/file URLs inside a text body. Global and
 * case-insensitive; trailing punctuation such as `.` or `,` is not captured.
 */
const detect_url = /(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;
|
423 |
|
/** Endpoint of the render server; replaceable through {@link setRenderServer}. */
let SEO_ROOT = 'http://seo.bablic.com/api/engine/seo';

/**
 * Points the module at a different render server.
 *
 * @param url New endpoint; must be truthy.
 * @throws Error when `url` is empty or otherwise falsy.
 */
export function setRenderServer(url: string) {
    if (url) {
        SEO_ROOT = url;
        return;
    }
    throw new Error("Must be a valid URL");
}
|
432 |
|
/** Returns the hex-encoded MD5 digest of `data` (used for cache file names). */
function hash(data){
    const md5 = crypto.createHash('md5');
    md5.update(data);
    return md5.digest('hex');
}
|
/**
 * Builds the cache file path for a page: `<cacheDir>/<locale>/<md5 of url>`.
 */
function fullPathFromUrl(url: string, locale: string, cacheDir: string) {
    const fileName = hash(url);
    return [cacheDir, locale, fileName].join("/");
}
|
/** Returns the per-locale cache directory: `<cacheDir>/<locale>`. */
function getCacheDir(locale: string, cacheDir: string) {
    return `${cacheDir}/${locale}`;
}
|
/**
 * Tells whether a cache file is still fresh.
 *
 * @param file_stats Stats of the cache file; only `mtime` is read.
 * @param cacheDays  Lifetime of an entry in days.
 * @returns true while the current time is before `mtime + cacheDays`.
 */
function cacheValid(file_stats: Stats, cacheDays: number) {
    const expiresAt = moment(file_stats.mtime.getTime()).add(cacheDays, 'days');
    return moment().isBefore(expiresAt);
}
|
448 |
|
/** Extensions of resources that are never translated (unanchored, case-insensitive). */
const filename_tester = /\.(js|css|jpg|jpeg|png|mp3|avi|mpeg|bmp|wav|pdf|doc|xml|docx|xlsx|xls|json|kml|svg|eot|woff|woff2)/i;

/** True when the request URL looks like a static asset rather than a page. */
function ignorable(req) {
    const requestedUrl = req.url;
    return filename_tester.test(requestedUrl);
}
|
/** Substrings that identify crawler user agents (case-insensitive). */
const google_tester = /bot|crawler|yandex|bing|baidu|spider|facebook|twitter|80legs|google|seo/i;

/** True when the request's `user-agent` header matches a known crawler pattern. */
function isBot(req) {
    const userAgent = req.headers['user-agent'];
    return google_tester.test(userAgent);
}
|
457 |
|
/** A request is handled when it comes from a crawler and does not target a static asset. */
function shouldHandle(req) {
    if (!isBot(req)) {
        return false;
    }
    return !ignorable(req);
}
|
461 |
|
/** True for sitemap/robots requests, whose bodies get their URLs rewritten instead of translated. */
function shouldReplaceUrls(req) {
    const sitemapOrRobots = /sitemap|robots/i;
    return sitemapOrRobots.test(req.url);
}
|
465 |
|
466 |
|
/**
 * Probes the render server with a GET request (10 s timeout).
 *
 * Never rejects: resolves `false` when the request itself fails and `true`
 * for any response, whatever its status code.
 */
function renderHealthCheck(): Promise<boolean> {
    return new Promise<boolean>((resolve) => {
        debug('render health check');
        const probe = {
            url: SEO_ROOT,
            headers:{
                "Accept-Encoding": "gzip,deflate"
            },
            method: 'GET',
            timeout: 10000,
        };
        request(probe, (error:any) => {
            if (!error) {
                debug('render is healthy');
                resolve(true);
                return;
            }
            debug('render is not healthy', error);
            resolve(false);
        });
    });
}
|
487 |
|
/** Last known state of the render server; assumed healthy until a probe fails. */
let isRenderHealthy = true;

// Re-probe the render server once a minute. The timer is unref'ed so that
// merely importing this module does not keep the Node process alive.
setInterval(() => {
    renderHealthCheck().then((health) => {
        isRenderHealthy = health;
    });
}, 1000*60).unref();
|
495 |
|
\ | No newline at end of file |