// Third-party HTML parser used by clean() to build the node tree.
var htmlparser = require('htmlparser2');

// Tag names that renderTag() emits without a closing tag and that indent()
// skips when tracking nesting depth.
var voidElements = [
    // HTML elements
    'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
    'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param',
    'source', 'track', 'wbr',

    // SVG elements
    'circle', 'ellipse', 'line', 'path', 'polygon', 'polyline', 'rect',
    'stop', 'use'
];

// Active settings; setup() replaces this object on every call.
var options = {};
|
35 |
|
/**
 * Reset the module-level `options` to the built-in defaults, then apply the
 * caller's overrides.
 *
 * Override rules:
 *  - list/string settings are replaced only when the supplied value is truthy;
 *  - 'break-around-comments' stays true unless it is exactly `false`;
 *  - 'remove-comments' and 'replace-nbsp' are true only when exactly `true`;
 *  - 'wrap' is taken only when it compares `>= 0`;
 *  - 'add-break-around-tags', 'add-remove-attributes' and 'add-remove-tags'
 *    are concatenated onto the corresponding (possibly replaced) list.
 *
 * @param {Object} [opt] user settings; when falsy only the defaults apply.
 */
function setup(opt) {
    options = {
        'break-around-comments': true,
        'break-around-tags': [
            'blockquote', 'body', 'br', 'div',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'head', 'hr', 'link', 'meta', 'p',
            'script', 'style', 'table', 'td', 'title', 'tr'
        ],
        'indent': ' ',
        'remove-attributes': [
            'align', 'bgcolor', 'border', 'cellpadding', 'cellspacing',
            'color', 'height', 'target', 'valign', 'width'
        ],
        'remove-comments': false,
        'remove-empty-tags': [],
        'remove-tags': ['center', 'font'],
        'replace-nbsp': false,
        'wrap': 120
    };

    if (!opt) {
        return;
    }

    // Flags: only an explicit boolean of the "other" polarity flips a default.
    options['break-around-comments'] = opt['break-around-comments'] !== false;
    options['remove-comments'] = opt['remove-comments'] === true;
    options['replace-nbsp'] = opt['replace-nbsp'] === true;

    // Settings that are replaced outright by any truthy user value.
    [
        'break-around-tags',
        'indent',
        'remove-attributes',
        'remove-empty-tags',
        'remove-tags'
    ].forEach(function (key) {
        if (opt[key]) {
            options[key] = opt[key];
        }
    });

    if (opt['wrap'] >= 0) {
        options['wrap'] = opt['wrap'];
    }

    // "add-*" settings extend a list instead of replacing it.
    ['break-around-tags', 'remove-attributes', 'remove-tags'].forEach(function (key) {
        var extra = opt['add-' + key];

        if (extra) {
            options[key] = options[key].concat(extra);
        }
    });
}
|
111 |
|
/**
 * Decide whether a node should be surrounded by line breaks.
 *
 * @param {Object} node parsed node (reads `type` and `name`).
 * @returns {*} false for removed nodes and text; the
 *     'break-around-comments' setting for comments; true for tags listed in
 *     'break-around-tags'; otherwise whatever breakWithin(node) returns.
 */
function breakAround(node) {
    // Nodes that will not be emitted, and plain text, never force a break.
    if (shouldRemove(node) || node.type == 'text') {
        return false;
    }

    if (node.type == 'comment') {
        return options['break-around-comments'];
    }

    var listed = options['break-around-tags'].indexOf(node.name) != -1;

    // An unlisted tag still breaks when something inside it does.
    return listed || breakWithin(node);
}
|
131 |
|
/**
 * Decide whether line breaks belong just inside a tag, i.e. right after its
 * opening tag and right before its closing tag.
 *
 * Only nodes whose type is 'tag' can qualify; removed nodes never do.
 *
 * The previous implementation also ran `children.some(breakWithin)` after
 * `children.some(breakAround)`. That second pass could never change the
 * result: whenever breakWithin(child) is true the child is a kept tag, and
 * breakAround(child) then returns true as well (directly, or through its own
 * fallback to breakWithin). The duplicate pass only doubled the work at each
 * nesting level, so it has been dropped.
 *
 * @param {Object} node parsed node (reads `type` and `children`).
 * @returns {boolean} true when any child needs breaks around it.
 */
function breakWithin(node) {
    if (shouldRemove(node) || node.type != 'tag') {
        return false;
    }

    return node.children.some(breakAround);
}
|
143 |
|
/**
 * Report whether a node carries no visible content.
 *
 *  - text: empty when only whitespace remains after trimming; with the
 *    'replace-nbsp' option on, `&nbsp;` entities count as whitespace too;
 *  - comment: empty when its data is only whitespace;
 *  - anything else: empty when it has no children or all children are empty.
 *
 * Fix: the 'replace-nbsp' branch used to compute its result and throw it
 * away (no `return`), so the option never influenced emptiness. The pattern
 * is also spelled out as the literal `&nbsp;` entity, because a raw
 * whitespace pattern would be a no-op ahead of trim().
 * NOTE(review): this assumes text data can contain the undecoded `&nbsp;`
 * entity - confirm against the parser's entity-decoding settings.
 *
 * @param {Object} node parsed node (reads `type`, `data`, `children`).
 * @returns {boolean} true when the node has nothing worth rendering.
 */
function isEmpty(node) {
    if (node.type == 'text') {
        var text = node.data;

        if (options['replace-nbsp']) {
            text = text.replace(/&nbsp;/g, ' ');
        }

        return !text.trim();
    }

    if (node.type == 'comment') {
        return !node.data.trim();
    }

    return !node.children.length || node.children.every(isEmpty);
}
|
159 |
|
/**
 * Decide whether a node is dropped from the rendered output.
 *
 *  - text: dropped when empty;
 *  - comment: dropped when 'remove-comments' is set, or when empty;
 *  - tag listed in 'remove-empty-tags': dropped only when empty (this check
 *    wins even if the tag is also listed in 'remove-tags');
 *  - any other node: dropped when its name is listed in 'remove-tags'.
 *
 * @param {Object} node parsed node (reads `type` and `name`).
 * @returns {*} truthy when the node should be removed.
 */
function shouldRemove(node) {
    var kind = node.type;

    if (kind == 'text' || kind == 'comment') {
        // Only comments consult the 'remove-comments' switch.
        return (kind == 'comment' && options['remove-comments']) || isEmpty(node);
    }

    var name = node.name;

    return options['remove-empty-tags'].indexOf(name) != -1
        ? isEmpty(node)
        : options['remove-tags'].indexOf(name) != -1;
}
|
175 |
|
/**
 * Render a text node.
 *
 * Removed nodes render as ''. Text whose parent node has type 'script' or
 * 'style' is returned untouched. Otherwise the text is trimmed on a side
 * that has no sibling or whose sibling gets line breaks around it, and every
 * run of whitespace is collapsed to a single space.
 *
 * Fix: with 'replace-nbsp' enabled the substitution now targets the literal
 * `&nbsp;` entity. As previously written the pattern was whitespace, which
 * the final whitespace collapse already normalises, so the option had no
 * effect on the output.
 * NOTE(review): this assumes text data can contain the undecoded `&nbsp;`
 * entity - confirm against the parser's entity-decoding settings.
 *
 * @param {Object} node text node (reads `data`, `parent`, `prev`, `next`).
 * @returns {string} the text to emit.
 */
function renderText(node) {
    if (shouldRemove(node)) {
        return '';
    }

    var text = node.data;

    // Whitespace is significant inside scripts and stylesheets.
    if (node.parent && (node.parent.type == 'script' || node.parent.type == 'style')) {
        return text;
    }

    if (options['replace-nbsp']) {
        text = text.replace(/&nbsp;/g, ' ');
    }

    // A neighbouring line break makes the adjacent whitespace redundant.
    if (!node.prev || breakAround(node.prev)) {
        text = text.trimLeft();
    }

    if (!node.next || breakAround(node.next)) {
        text = text.trimRight();
    }

    return text.replace(/\s+/g, ' ');
}
|
202 |
|
/**
 * Render a comment node as `<!--data-->`, placed on its own line when
 * breakAround() says so. Removed comments render as ''.
 *
 * @param {Object} node comment node (reads `data`).
 * @returns {string} the comment markup.
 */
function renderComment(node) {
    if (shouldRemove(node)) {
        return '';
    }

    var markup = '<!--' + node.data + '-->';

    return breakAround(node) ? '\n' + markup + '\n' : markup;
}
|
216 |
|
/**
 * Render a tag node and, recursively, its children.
 *
 *  - A removed tag renders as '' when empty, otherwise as its rendered
 *    children (the tag is unwrapped).
 *  - Attributes listed in 'remove-attributes' are skipped.
 *  - Names in `voidElements` are emitted without children or a closing tag.
 *  - Line breaks are added around and/or just inside the tag according to
 *    breakAround() and breakWithin().
 *
 * Fix: attribute values are always written inside double quotes, so a value
 * that itself contains `"` (for example one that was single-quoted in the
 * input) used to terminate the attribute early and corrupt the markup.
 * Embedded double quotes are now written as `&quot;`.
 *
 * @param {Object} node tag node (reads `name`, `attribs`, `children`).
 * @returns {string} the rendered markup.
 */
function renderTag(node) {
    if (shouldRemove(node)) {
        return isEmpty(node) ? '' : render(node.children);
    }

    var openTag = '<' + node.name;

    for (var attrib in node.attribs) {
        if (options['remove-attributes'].indexOf(attrib) == -1) {
            var value = String(node.attribs[attrib]).replace(/"/g, '&quot;');

            openTag += ' ' + attrib + '="' + value + '"';
        }
    }

    openTag += '>';

    var surround = breakAround(node);

    if (voidElements.indexOf(node.name) != -1) {
        return surround ? '\n' + openTag + '\n' : openTag;
    }

    var closeTag = '</' + node.name + '>';

    if (surround) {
        openTag = '\n' + openTag;
        closeTag = closeTag + '\n';
    }

    if (breakWithin(node)) {
        openTag = openTag + '\n';
        closeTag = '\n' + closeTag;
    }

    return openTag + render(node.children) + closeTag;
}
|
258 |
|
/**
 * Render a directive node by wrapping its data in angle brackets.
 *
 * @param {Object} node directive node (reads `data`).
 * @returns {string} `<` + data + `>`.
 */
function renderDirective(node) {
    var content = node.data;

    return '<' + content + '>';
}
|
262 |
|
/**
 * Render a list of nodes to markup.
 *
 * Each node is dispatched on its `type`: 'root' recurses into its children,
 * 'text', 'comment' and 'directive' go to their dedicated renderers, and
 * every other type is handed to renderTag(). Runs of consecutive newlines in
 * the combined result are collapsed into one.
 *
 * @param {Array} nodes nodes to render, in document order.
 * @returns {string} the concatenated markup.
 */
function render(nodes) {
    function renderNode(node) {
        var kind = node.type;

        if (kind == 'root') {
            return render(node.children);
        }

        if (kind == 'text') {
            return renderText(node);
        }

        if (kind == 'comment') {
            return renderComment(node);
        }

        if (kind == 'directive') {
            return renderDirective(node);
        }

        return renderTag(node);
    }

    var html = nodes.reduce(function (markup, node) {
        return markup + renderNode(node);
    }, '');

    return html.replace(/\n+/g, '\n');
}
|
295 |
|
/**
 * Build the whitespace prefix for a nesting depth by repeating the
 * configured 'indent' string.
 *
 * @param {number} indentLevel nesting depth; zero or negative yields ''.
 * @returns {string} the 'indent' option repeated `indentLevel` times.
 */
function getIndent(indentLevel) {
    var padding = '';

    for (var remaining = indentLevel; remaining > 0; remaining--) {
        padding += options['indent'];
    }

    return padding;
}
|
305 |
|
/**
 * Break a line that exceeds the 'wrap' column limit at spaces, prefixing
 * every continuation line with `indent`.
 *
 * The break point is the last space at or before the limit; when there is
 * none, the first space after the limit is used; when the line has no usable
 * space at all it is returned unchanged.
 *
 * Fix: spaces that belong to the line's leading whitespace are no longer
 * accepted as break points. Previously, when the first word of an indented
 * line was longer than the limit, the search landed inside the indentation,
 * the "second line" came out identical to the input, and the function
 * recursed until the call stack overflowed.
 *
 * @param {string} line   line to wrap (may end with a newline).
 * @param {string} indent prefix for continuation lines.
 * @returns {string} the line with newlines inserted at the chosen spaces.
 */
function wrap(line, indent) {
    var limit = options['wrap'],
        contentStart = line.search(/\S/);

    // A whitespace-only line has nothing to break.
    if (contentStart == -1) {
        return line;
    }

    var bound = line.lastIndexOf(' ', limit);

    if (bound < contentStart) {
        // No break point inside the content before the limit, so take the
        // first space after it (and after the leading whitespace).
        bound = line.indexOf(' ', Math.max(limit, contentStart));

        if (bound == -1) {
            return line;
        }
    }

    var line1 = line.slice(0, bound),
        line2 = indent + line.slice(bound + 1);

    // The remainder always holds less content than `line`, so this ends.
    if (line2.length > limit) {
        line2 = wrap(line2, indent);
    }

    return line1 + '\n' + line2;
}
|
326 |
|
/**
 * Indent rendered markup line by line according to tag nesting and wrap
 * lines that exceed the 'wrap' limit.
 *
 * Tags are detected per line with a regular expression. Names listed in
 * `voidElements` are ignored; every other opening tag raises the running
 * depth and every closing tag lowers it. A line is indented at the depth in
 * effect before any tags it opens and leaves unclosed, so an opening tag
 * sits one level above its content and a closing tag lines up with it.
 *
 * Fix: the line pattern used to require a trailing newline, so a final line
 * without one was passed through untouched, neither indented nor wrapped.
 * Markup that renders as a single line was therefore never wrapped at all.
 * The pattern now also matches an unterminated last line.
 *
 * @param {string} html markup produced by render().
 * @returns {string} the indented (and possibly wrapped) markup.
 */
function indent(html) {
    var indentLevel = 0;

    return html.replace(/.*\n|.+$/g, function (line) {
        var tagRegEx = /<\/?(\w+).*?>/g,
            opened = 0, // tags opened on this line and not yet closed on it
            result;

        while ((result = tagRegEx.exec(line)) !== null) {
            if (voidElements.indexOf(result[1]) != -1) {
                continue;
            }

            if (result[0].indexOf('</') == -1) {
                opened++;
                indentLevel++;
            } else {
                if (opened > 0) {
                    opened--;
                }

                indentLevel--;
            }
        }

        var prefix = getIndent(indentLevel - opened);

        line = prefix + line;

        if (options['wrap'] && line.length > options['wrap']) {
            line = wrap(line, prefix);
        }

        return line;
    });
}
|
365 |
|
/**
 * Clean an HTML string and hand the result to a callback.
 *
 * The settings are (re)initialised through setup(), the markup is parsed
 * with htmlparser2, and the resulting tree is rendered, indented and
 * trimmed before `callback` receives it.
 *
 * @param {string} html markup to clean.
 * @param {Object|Function} [opt] settings for setup(); may be omitted, in
 *     which case the second argument is treated as the callback.
 * @param {Function} callback called with the cleaned markup.
 * @throws rethrows any error reported by the DOM handler.
 */
function clean(html, opt, callback) {
    var settings = opt,
        done = callback;

    // Support the two-argument form clean(html, callback).
    if (typeof opt == 'function') {
        done = opt;
        settings = null;
    }

    setup(settings);

    function onParsed(err, dom) {
        if (err) {
            throw err;
        }

        done(indent(render(dom)).trim());
    }

    var parser = new htmlparser.Parser(new htmlparser.DomHandler(onParsed));

    parser.write(html);
    parser.done();
}
|
389 |
|
// Public API: clean() is the only exported function.
module.exports = { clean: clean };
|