1 |
|
2 |
|
3 | import * as async from 'async';
|
4 | import * as crypto from 'crypto';
|
5 | import * as fs from 'fs';
|
6 | import * as moment from 'moment';
|
7 | import * as OS from 'os';
|
8 | import * as request from 'request';
|
9 | import * as Debug from 'debug';
|
10 | import * as UrlParser from 'url';
|
11 |
|
// Diagnostic logger for this module, created under the 'bablic:seo' namespace.
12 | const debug = Debug('bablic:seo');
|
13 |
|
14 | import {ExtendedRequest, ExtendedResponse, Middleware, getLink, KeywordMapper, SiteMeta} from "./common";
|
15 | import {ServerResponse} from "http";
|
16 | import {Stats} from "fs";
|
17 | import {RequestResponse} from "request";
|
18 |
|
/**
 * Behaviour switches for {@link SeoMiddleware}.
 */
export interface SeoOptions {
    /** When falsy, `getFromCache` skips the on-disk lookup entirely. */
    useCache?: boolean;

    /** Not referenced anywhere in this file. */
    defaultCache?: string[];

    /** Not referenced anywhere in this file. */
    test?: boolean;

    /**
     * When set, the page URL sent to the render server is built as
     * `"http://" + altHost + originalUrl` instead of from the request's own
     * host, and absolute links containing this host are also rewritten in
     * sitemap/robots responses.
     */
    altHost?: string;
}
|
25 |
|
/**
 * Sub-directory settings forwarded to the render server as query parameters
 * by `SeoMiddleware.getHtml`.
 */
export interface SeoSubDirOptions {
    /** When truthy, `&ld=subdir` is appended and the two fields below are considered. */
    subDir: boolean;

    /** When non-empty, sent URL-encoded as `&sdb=<value>`. */
    subDirBase: string;

    /** When truthy, `&sdo=true` is appended. */
    subDirOptional: boolean;
}
|
31 |
|
/**
 * Serves translated HTML to crawlers and rewrites absolute links in
 * sitemap/robots responses.
 *
 * For a handled request the middleware buffers whatever the downstream
 * handlers write to the response, sends that HTML to the render server
 * (`SEO_ROOT`) and flushes the translated result instead. Translated pages
 * are stored on disk under a path derived from the page URL.
 */
export class SeoMiddleware {
    constructor(private siteId: string, private options: SeoOptions, private subDirOptions: SeoSubDirOptions) {}

    /**
     * POSTs the page to the render server and resolves with the response body.
     *
     * @param url    Absolute URL of the page; sent URL-encoded as the `url` query parameter.
     * @param locale Sent as the `el` query parameter.
     * @param html   Original HTML, sent as `{html}` in the JSON request body.
     * @returns A promise resolving with the response body. It rejects on a
     *          transport error, on a non-2xx status (`"Status-<code>"`) or
     *          when the body is null/undefined (`"empty response"`).
     *
     * After resolving, the body is also written to the cache file for `url`;
     * a failed write is only logged.
     */
    getHtml(url: string, locale: string, html?: string): Promise<string> {
        debug('getting from bablic', url, 'html:', !!html );
        let ld = '';
        if (this.subDirOptions.subDir) {
            ld = '&ld=subdir';
            if (this.subDirOptions.subDirBase)
                ld += '&sdb=' + encodeURIComponent(this.subDirOptions.subDirBase);
            if (this.subDirOptions.subDirOptional)
                ld += '&sdo=true';
        }
        return new Promise<string>((resolve, reject) => {
            request({
                url: SEO_ROOT + "?site=" + this.siteId + "&el=" + locale + "&url=" + (encodeURIComponent(url)) + ld,
                method: 'POST',
                json: {
                    html: html
                }
            }, (error: any, response: RequestResponse, body: any) => {
                if (error)
                    return reject(error);

                if (response.statusCode < 200 || response.statusCode >= 300)
                    return reject(new Error("Status-" + response.statusCode));

                if (body == null)
                    return reject(new Error('empty response'));

                debug('received translated html', response.statusCode);
                resolve(body);
                fs.writeFile(fullPathFromUrl(url), body, error => error && console.error('Error saving to cache', error));
            });
        });
    }

    /**
     * Looks up a previously stored page for `url`.
     *
     * @param url      Absolute page URL; the cache file path is derived from it.
     * @param skip     When true the lookup is bypassed.
     * @param callback Called with no arguments when caching is disabled or
     *                 skipped; with the error when `stat`/`readFile` fails;
     *                 otherwise with the file content decoded as UTF-8 and
     *                 the result of `cacheValid` for the file's stats.
     */
    getFromCache(url: string, skip: boolean, callback: (e?: Error, html?: string, isValid?: boolean) => void) {
        if (!this.options.useCache || skip)
            return callback();

        let file_path = fullPathFromUrl(url);
        fs.stat(file_path, (error: NodeJS.ErrnoException, file_stats: Stats) => {
            if (error)
                return callback(error);

            fs.readFile(file_path, (error: NodeJS.ErrnoException, data: Buffer) => {
                if (error)
                    return callback(error);
                callback(error, data.toString('utf8'), cacheValid(file_stats));
            });
        });
    }

    /**
     * Builds the request handler.
     *
     * The returned function calls `next()` untouched for requests that are
     * neither handled by `shouldHandle` nor matched by `shouldReplaceUrls`.
     * Otherwise it answers from cache when possible and replaces
     * `res.write`, `res.end` and `res.writeHead` so the downstream output can
     * be captured and post-processed.
     */
    middleware() {
        return (meta: SiteMeta, keywordsByLocale: KeywordMapper, reverseKeywordByLocale: KeywordMapper, req: ExtendedRequest, res: ExtendedResponse, next: () => void) => {

            let replaceUrls = shouldReplaceUrls(req);
            if (!shouldHandle(req) && !replaceUrls) {
                debug('ignored', req.url);
                return next();
            }

            // NOTE(review): presumably removed so downstream handlers do not
            // compress the body that is buffered as text below - confirm.
            delete req.headers['accept-encoding'];
            req.bablic.proxied = true;

            let protocol = req.headers['x-forwarded-proto'] || 'http';
            let my_url = protocol + "://" + req.headers.host + req.originalUrl;
            if (this.options.altHost)
                my_url = "http://" + this.options.altHost + req.originalUrl;

            this.getFromCache(my_url, replaceUrls, (e, html, isValid) => {
                // True when the client was already answered from a stale cache
                // entry and the rest of the pipeline only runs to refresh it.
                let cache_only = false;
                if (html) {
                    debug('flushing from cache');
                    res.setHeader('Content-Type', 'text/html; charset=utf-8');
                    res.setHeader('Content-Language', req.bablic.locale);
                    res.write(html);
                    res.end();
                    if (isValid)
                        return;
                    cache_only = true;
                }

                debug('overriding response');
                let _end = res.end;
                let _write = res.write;
                let _writeHead = res.writeHead;
                // Untyped view of the response, used to install replacements
                // whose signatures are looser than the declared ones.
                let justAnObject: any = <any>res;

                // Records status and headers on the response instead of
                // sending them. Accepts both writeHead(status, headers) and
                // writeHead(status, statusMessage, headers), and returns the
                // response so calls can be chained.
                justAnObject.writeHead = (status: number, reason?: any, headerMap?: any) => {
                    if (typeof reason === 'string')
                        justAnObject.statusMessage = reason;
                    else
                        headerMap = reason;
                    res.statusCode = status;
                    if (headerMap && typeof headerMap === 'object') {
                        for (let key in headerMap)
                            res.setHeader(key, headerMap[key]);
                    }
                    return res;
                };

                let headers: {[name: string]: any} = {};
                let _getHeader;
                if (cache_only) {
                    // The real headers have been sent already; keep header
                    // operations in a local map so downstream code can still
                    // call them.
                    _getHeader = res.getHeader;
                    justAnObject.setHeader = (name: string, value: any) => headers[name.toLowerCase().trim()] = value;
                    justAnObject.removeHeader = (name: string) => headers[name.toLowerCase().trim()] = null;
                    justAnObject.getHeader = (name: string) => {
                        let local = headers[name.toLowerCase().trim()];
                        if (local)
                            return local;
                        if (local === null)
                            return;
                        return _getHeader.call(res, name);
                    };
                }

                // Puts the original write/end/writeHead back; safe to call repeatedly.
                let restore_override = () => {
                    if (!_write || !_end || !_writeHead)
                        return;
                    debug('undo override');
                    res.write = _write;
                    res.end = _end;
                    res.writeHead = _writeHead;
                    if (cache_only)
                        _getHeader = null;

                    _write = _end = _writeHead = null;
                };

                let head_checked = false;
                let is_html = null;
                let chunks: string[] = [];

                // Decides once, on the first write/end, whether this response
                // is buffered: it must have a 2xx status and either a
                // text/html content type or be a sitemap/robots request with
                // a content type set.
                let check_head = () => {
                    if (head_checked)
                        return;

                    is_html = false;
                    if (typeof(res.getHeader('content-type')) !== 'undefined')
                        is_html = ((<string>res.getHeader('content-type')).indexOf('text/html') > -1) || replaceUrls;

                    if (!is_html) {
                        debug('not html', res.getHeader('content-type'));
                        restore_override();
                    }
                    if (res.statusCode < 200 || res.statusCode >= 300) {
                        debug('error response', res.statusCode);
                        is_html = false;
                        restore_override();
                    }
                    head_checked = true;
                };

                // Appends one body chunk to the buffer, decoding Buffers with
                // the given encoding (Buffer's default when undefined).
                let bufferChunk = (chunk: any, encoding?: string) => {
                    if (chunk instanceof Buffer)
                        chunk = (<Buffer>chunk).toString(encoding);
                    chunks.push(<string>chunk);
                };

                res.write = function(chunk?: any, encoding?: any, cb?: any) {
                    // write(chunk, cb): the callback sits in the encoding slot
                    // and must not be used as an encoding.
                    if (typeof(encoding) == 'function') {
                        cb = <Function>encoding;
                        encoding = void(0);
                    }

                    check_head();
                    if (!is_html) {
                        if (cache_only)
                            return true;

                        debug('write original');
                        // check_head() restored the original, so this is the real write.
                        return res.write.apply(res, arguments);
                    }
                    bufferChunk(chunk, encoding);
                    if (cb)
                        cb();
                    // The chunk was accepted; report no back-pressure.
                    return true;
                };

                const self = this;
                let alt_host = this.options.altHost;
                justAnObject.end = function(chunk?: any, encoding?: any, cb?: any) {
                    // end(cb) and end(chunk, cb) forms.
                    if (typeof(chunk) == 'function') {
                        cb = <Function>chunk;
                        chunk = null;
                        encoding = void(0);
                    } else if (typeof(encoding) == 'function') {
                        cb = <Function>encoding;
                        encoding = void(0);
                    }

                    check_head();
                    if (!is_html) {
                        if (cache_only)
                            return;
                        debug('flush original');
                        restore_override();
                        return res.end.apply(res, arguments);
                    }

                    // Buffer the final chunk directly so the completion
                    // callback is not triggered here as well as on the final
                    // flush below.
                    if (chunk != null)
                        bufferChunk(chunk, encoding);

                    if (cache_only && cb) {
                        // Nothing more is written to the already finished
                        // response, so acknowledge the caller now.
                        cb();
                        cb = void(0);
                    }

                    let original_html = chunks.join('');
                    res.setHeader('Content-Language', req.bablic.locale);
                    if (replaceUrls) {
                        restore_override();
                        let rewritten = original_html.replace(detect_url, url => {
                            if (ignore_not_html_or_xml.test(url))
                                return url;
                            // Only links pointing at this site (or its alternate host) are rewritten.
                            if (url.indexOf(<string>req.headers.host) === -1 && (!alt_host || url.indexOf(alt_host) === -1))
                                return url;

                            let parsed = UrlParser.parse(url);

                            if (keywordsByLocale && keywordsByLocale[req.bablic.locale]) {
                                let keywords = keywordsByLocale[req.bablic.locale];
                                parsed.pathname = parsed.pathname.split('/').map(part => keywords[part] || part).join('/');
                            }
                            return getLink(req.bablic.locale, parsed, meta);
                        });
                        res.setHeader('Content-Length', Buffer.byteLength(rewritten));
                        res.write(rewritten, cb);
                        return res.end();
                    }
                    self.getHtml(my_url, req.bablic.locale, original_html).then((data) => {
                        if (cache_only)
                            return;
                        restore_override();
                        debug('flushing translated');
                        res.setHeader('Content-Length', Buffer.byteLength(data));
                        res.write(data, cb);
                        res.end();
                    }, (error) => {
                        if (cache_only)
                            return;
                        restore_override();
                        console.error('[Bablic SDK] Error:', error);
                        debug('flushing original');
                        res.write(original_html, cb);
                        res.end();
                    });
                };
                return next();
            });
        };
    }
}
|
267 |
|
268 |
|
/**
 * Matches URLs whose path ends in a static-asset extension. These are left
 * untouched when links are rewritten.
 *
 * The extension must be the end of the path part (followed by `?`, `#` or the
 * end of the string) and must appear before any query or fragment. This keeps
 * hosts such as `www.docs.example.com` and slugs such as `/node.js-tips` from
 * being mistaken for assets.
 */
const ignore_not_html_or_xml = /^[^?#]*\.(js|css|jpg|jpeg|png|mp3|avi|mpeg|bmp|wav|pdf|doc|docx|xlsx|xls|json|kml|svg|eot|woff|woff2)(?=[?#]|$)/i;

/**
 * Finds absolute http(s)/ftp/file URLs inside free text. The last character
 * class omits punctuation such as `.`, `,`, `;`, `:`, `!` and `?`, so trailing
 * sentence punctuation is not included in a match.
 */
const detect_url = /(\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|!:,.;]*[-A-Z0-9+&@#\/%=~_|])/ig;
|
272 |
|
/** Endpoint of the render server; replaceable through `setRenderServer`. */
let SEO_ROOT = 'http://seo.bablic.com/api/engine/seo';

/**
 * Points the module at a different render server.
 *
 * @param url Endpoint to use from now on.
 * @throws Error when `url` is empty or otherwise falsy; the current endpoint
 *         is left unchanged in that case.
 */
export function setRenderServer(url: string) {
    if (url) {
        SEO_ROOT = url;
        return;
    }
    throw new Error("Must be a valid URL");
}
|
281 |
|
/**
 * Returns the MD5 digest of `data` as a lowercase hexadecimal string.
 */
function hash(data) {
    const md5 = crypto.createHash('md5');
    md5.update(data);
    return md5.digest('hex');
}
|
285 |
|
/**
 * Maps a page URL to its cache file: the hash of the URL, placed directly
 * inside the operating system's temporary directory.
 */
function fullPathFromUrl(url) {
    const cacheDir = OS.tmpdir();
    return `${cacheDir}/${hash(url)}`;
}
|
/** Default lifetime of a cached page, in milliseconds (30 minutes). */
const CACHE_MAX_AGE_MS = 30 * 60 * 1000;

/**
 * Tells whether a cache file is still fresh.
 *
 * @param file_stats Object exposing the file's modification time as a `Date`
 *                   in `mtime` (for example the result of `fs.stat`).
 * @param maxAgeMs   How long after its last modification the file counts as
 *                   fresh; defaults to 30 minutes.
 * @returns `true` while the current time is strictly before
 *          `mtime + maxAgeMs`; `false` otherwise, including when `mtime` is
 *          an invalid date.
 */
function cacheValid(file_stats, maxAgeMs = CACHE_MAX_AGE_MS) {
    const expiresAt = file_stats.mtime.getTime() + maxAgeMs;
    return Date.now() < expiresAt;
}
|
295 |
|
/** Static-asset (and XML) extensions, matched at the very end of a URL path. */
const filename_tester = /\.(js|css|jpg|jpeg|png|mp3|avi|mpeg|bmp|wav|pdf|doc|xml|docx|xlsx|xls|json|kml|svg|eot|woff|woff2)$/i;

/**
 * Tells whether a request targets a static file rather than a page.
 *
 * Only the path is inspected: the query string and fragment are dropped
 * first, and the extension must end the path. A slug such as
 * `/blog/node.js-tips` or a parameter such as `?file=report.pdf` therefore
 * does not make a page look like an asset.
 *
 * @param req Request-like object; only `req.url` is read.
 */
function ignorable(req) {
    const path = String(req.url).split(/[?#]/)[0];
    return filename_tester.test(path);
}
|
/** Case-insensitive substrings that mark a user agent as a crawler. */
const google_tester = /bot|crawler|baiduspider|facebook|twitter|80legs|google|seo/i;

/**
 * Tells whether the request's `user-agent` header matches one of the known
 * crawler markers.
 *
 * @param req Request-like object; only `req.headers['user-agent']` is read.
 */
function isBot(req) {
    const userAgent = req.headers['user-agent'];
    return google_tester.test(userAgent);
}
|
304 |
|
/**
 * A request is translated only when it comes from a crawler and does not
 * target an ignorable (static) file.
 */
function shouldHandle(req) {
    if (!isBot(req)) {
        return false;
    }
    return !ignorable(req);
}
|
308 |
|
/**
 * Tells whether the request URL mentions "sitemap" or "robots" (any letter
 * case). Such responses get their absolute links rewritten instead of being
 * translated.
 *
 * @param req Request-like object; only `req.url` is read.
 */
function shouldReplaceUrls(req) {
    const linkListPattern = /sitemap|robots/i;
    return linkListPattern.test(req.url);
}
|
312 |
|