// Third-party HTML parser used by clean() to build the DOM that gets rendered.
var htmlparser = require('htmlparser2');

// Active configuration shared by every helper in this module; setup() replaces
// it with a fresh object on each clean() call.
var options = {};
|
3 |
|
/**
 * Resets the module-level `options` object to the built-in defaults and then
 * applies the caller's overrides, if any.
 *
 * Keys read from `opt`:
 *  - 'attr-to-remove', 'block-tags', 'empty-tags', 'tags-to-remove':
 *    a truthy value replaces the corresponding default list.
 *  - 'add-attr-to-remove', 'add-block-tags', 'add-empty-tags',
 *    'add-tags-to-remove': concatenated onto the (possibly replaced) list.
 *  - 'break-around-comments', 'break-after-br': enabled unless exactly `false`.
 *  - 'remove-comments', 'remove-empty-paras', 'replace-nbsp': disabled unless
 *    exactly `true`.
 *  - 'indent': the string used for one indentation level. Any string is
 *    honoured, including '' (no indentation).
 *
 * @param {Object} [opt] Overrides; a falsy value keeps the plain defaults.
 */
function setup(opt) {
    var listKeys = ['attr-to-remove', 'block-tags', 'empty-tags', 'tags-to-remove'],
        onUnlessFalse = ['break-around-comments', 'break-after-br'],
        offUnlessTrue = ['remove-comments', 'remove-empty-paras', 'replace-nbsp'];

    options = {
        'attr-to-remove': [
            'align',
            'bgcolor',
            'border',
            'cellpadding',
            'cellspacing',
            'color',
            'disabled',
            'height',
            'target',
            'valign',
            'width'
        ],
        'block-tags': [
            'blockquote',
            'div',
            'h1',
            'h2',
            'h3',
            'h4',
            'h5',
            'h6',
            'hr',
            'p',
            'table',
            'td',
            'tr'
        ],
        'break-around-comments': true,
        'break-after-br': true,
        'empty-tags': [
            'br',
            'hr',
            'img'
        ],
        'indent': ' ',
        'remove-comments': false,
        'remove-empty-paras': false,
        'replace-nbsp': false,
        'tags-to-remove': [
            'center',
            'font'
        ]
    };

    if (!opt) {
        return;
    }

    listKeys.forEach(function (key) {
        var additions = opt['add-' + key];

        options[key] = opt[key] || options[key];

        // concat() returns a new array, so the caller's lists are never mutated.
        if (additions) {
            options[key] = options[key].concat(additions);
        }
    });

    onUnlessFalse.forEach(function (key) {
        options[key] = opt[key] !== false;
    });

    offUnlessTrue.forEach(function (key) {
        options[key] = opt[key] === true;
    });

    // A plain `||` would discard an explicit '' and make it impossible to turn
    // indentation off, so strings are taken verbatim. Non-string values keep
    // the previous truthiness-based fallback.
    options['indent'] = typeof opt['indent'] === 'string'
        ? opt['indent']
        : opt['indent'] || options['indent'];
}
|
82 |
|
/**
 * Tells whether a parsed node carries no visible content.
 *
 * Text and comment nodes are empty when their data is blank after trimming.
 * Any other node is empty when it has no children or when every child is
 * itself empty (checked recursively).
 *
 * @param {Object} node Parsed DOM node.
 * @returns {boolean} true when the node is empty.
 */
function isEmpty(node) {
    var kind = node.type,
        kids,
        i;

    if (kind === 'text' || kind === 'comment') {
        return node.data.trim() === '';
    }

    kids = node.children;

    for (i = 0; i < kids.length; i++) {
        if (!isEmpty(kids[i])) {
            return false;
        }
    }

    return true;
}
|
90 |
|
/**
 * Renders a text node as a single run of text with whitespace collapsed.
 *
 * When the 'replace-nbsp' option is set, non-breaking spaces are turned into
 * ordinary spaces first. Both the `&nbsp;` entity (which appears literally in
 * the data when the parser does not decode entities) and the U+00A0 character
 * are matched. A regex matching only whitespace would be a no-op here, because
 * the final `\s+` collapse already covers whitespace characters.
 *
 * @param {Object} node Text node; `node.data` holds the raw text.
 * @returns {string} The text with every whitespace run reduced to one space.
 */
function renderText(node) {
    var text = node.data;

    if (options['replace-nbsp']) {
        text = text.replace(/&nbsp;|\u00a0/g, ' ');
    }

    return text.replace(/\s+/g, ' ');
}
|
101 |
|
/**
 * Renders a comment node back to `<!--...-->` markup.
 *
 * Returns an empty string when the 'remove-comments' option is set. When
 * 'break-around-comments' is set, the markup is wrapped in newlines.
 *
 * @param {Object} node Comment node; `node.data` holds the comment body.
 * @returns {string} The rendered comment, or '' when comments are removed.
 */
function renderComment(node) {
    var markup;

    if (options['remove-comments']) {
        return '';
    }

    markup = '<!--' + node.data + '-->';

    return options['break-around-comments'] ? '\n' + markup + '\n' : markup;
}
|
115 |
|
/**
 * Renders an element node (and, recursively, its children) to markup while
 * applying the clean-up options.
 *
 *  - Empty `<p>` elements yield '' when 'remove-empty-paras' is set.
 *  - Elements listed in 'tags-to-remove' are unwrapped: only their rendered
 *    children are returned.
 *  - Attributes listed in 'attr-to-remove' are dropped.
 *  - Elements listed in 'empty-tags' get no closing tag; `<br>` is followed by
 *    a newline when 'break-after-br' is set.
 *  - Elements listed in 'block-tags' get newlines around both tags.
 *
 * @param {Object} node Element node with `name`, `attribs` and `children`.
 * @returns {string} The rendered markup.
 */
function renderTag(node) {
    var openTag,
        closeTag,
        attrib;

    if (options['remove-empty-paras'] && node.name == 'p' && isEmpty(node)) {
        return '';
    }

    if (options['tags-to-remove'].indexOf(node.name) > -1) {
        if (!node.children.length) {
            return '';
        }

        return render(node.children);
    }

    openTag = '<' + node.name;

    for (attrib in node.attribs) {
        // Skip anything inherited through the prototype chain.
        if (!Object.prototype.hasOwnProperty.call(node.attribs, attrib)) {
            continue;
        }

        if (options['attr-to-remove'].indexOf(attrib) == -1) {
            // Values are always written inside double quotes, so an embedded
            // '"' (e.g. from title='say "hi"') must be escaped or it would end
            // the attribute early. Other characters are left untouched so
            // entities already present in the value are not double-encoded.
            openTag += ' ' + attrib + '="'
                + String(node.attribs[attrib]).replace(/"/g, '&quot;') + '"';
        }
    }

    openTag += '>';

    if (options['empty-tags'].indexOf(node.name) > -1) {
        if (options['break-after-br'] && node.name == 'br') {
            return openTag + '\n';
        }

        return openTag;
    }

    closeTag = '</' + node.name + '>';

    if (options['block-tags'].indexOf(node.name) > -1) {
        openTag = '\n' + openTag + '\n';
        closeTag = '\n' + closeTag + '\n';
    }

    if (!node.children.length) {
        return openTag + closeTag;
    }

    return openTag + render(node.children) + closeTag;
}
|
160 |
|
/**
 * Renders a list of parsed nodes to a single markup string and normalises the
 * whitespace of the result.
 *
 * Dispatch by `node.type`:
 *  - 'root'      -> its children are rendered in place
 *  - 'text'      -> renderText
 *  - 'comment'   -> renderComment
 *  - 'directive' -> emitted as '<' + node.data + '>' (e.g. `<!DOCTYPE html>`).
 *                   Sending a directive to renderTag would throw, because the
 *                   node has no `children` to read.
 *  - otherwise   -> renderTag
 *
 * @param {Array<Object>} nodes Nodes to render.
 * @returns {string} The rendered, whitespace-normalised markup.
 */
function render(nodes) {
    var html = '';

    nodes.forEach(function (node) {
        switch (node.type) {
        case 'root':
            html += render(node.children);
            break;
        case 'text':
            html += renderText(node);
            break;
        case 'comment':
            html += renderComment(node);
            break;
        case 'directive':
            html += '<' + node.data + '>';
            break;
        default:
            html += renderTag(node);
        }
    });

    // Collapse runs of spaces into a single space.
    html = html.replace(/ +/g, ' ');

    // Drop the space directly in front of a line break tag.
    html = html.replace(/ <br>/g, '<br>');

    // Strip spaces before a newline and all whitespace after it, which also
    // folds consecutive blank lines into one newline.
    html = html.replace(/ *\n\s*/g, '\n');

    return html;
}
|
194 |
|
/**
 * Builds the leading whitespace for a given nesting depth by repeating the
 * configured 'indent' string.
 *
 * @param {number} indentLevel Nesting depth; zero or a negative value gives ''.
 * @returns {string} The indent string repeated `indentLevel` times.
 */
function getIndent(indentLevel) {
    var padding = '',
        remaining = indentLevel;

    while (remaining > 0) {
        padding = padding + options['indent'];
        remaining--;
    }

    return padding;
}
|
204 |
|
/**
 * Indents rendered markup line by line according to tag nesting.
 *
 * Only lines terminated by a newline are processed. For each such line every
 * tag is scanned: tags listed in 'empty-tags' are ignored, an opening tag
 * raises the running depth and a closing tag lowers it. The depth carries over
 * from one line to the next.
 *
 * A line that leaves tags open is indented at the depth it started from, and
 * if its first open tag is directly followed by a space, that space becomes a
 * line break plus the indent of the new depth. Any other line is indented at
 * the depth reached once its tags have been counted.
 *
 * @param {string} html Markup produced by render().
 * @returns {string} The markup with indentation applied.
 */
function indent(html) {
    var depth = 0;

    function indentLine(line) {
        var tagPattern = /<\/?(\w+).*?>/g,
            stillOpen = [],
            match = tagPattern.exec(line);

        while (match !== null) {
            if (options['empty-tags'].indexOf(match[1]) === -1) {
                if (match[0].indexOf('</') !== -1) {
                    // Closing tag: popping an empty list is a harmless no-op,
                    // but the depth still drops.
                    stillOpen.pop();
                    depth--;
                } else {
                    stillOpen.push(match[0]);
                    depth++;
                }
            }

            match = tagPattern.exec(line);
        }

        if (stillOpen.length === 0) {
            return getIndent(depth) + line;
        }

        return getIndent(depth - stillOpen.length)
            + line.replace(stillOpen[0] + ' ', stillOpen[0] + '\n' + getIndent(depth));
    }

    return html.replace(/.*\n/g, indentLine);
}
|
240 |
|
/**
 * Cleans up an HTML string and hands the result to a callback.
 *
 * The options are (re)initialised through setup(), the input is parsed with
 * htmlparser2, and the resulting DOM is rendered, indented and trimmed before
 * being passed to the callback.
 *
 * @param {string} html Markup to clean.
 * @param {Object|Function} [opt] Options for setup(); may be omitted, in
 *     which case the callback is given as the second argument.
 * @param {Function} callback Called with the cleaned markup as its only
 *     argument.
 * @throws Rethrows any error reported by the DOM handler.
 */
function clean(html, opt, callback) {
    var settings = opt,
        done = callback,
        handler,
        parser;

    // Support the two-argument form clean(html, callback).
    if (typeof settings == 'function') {
        done = settings;
        settings = null;
    }

    setup(settings);

    handler = new htmlparser.DomHandler(function (err, dom) {
        if (err) {
            throw err;
        }

        done(indent(render(dom)).trim());
    });

    parser = new htmlparser.Parser(handler);
    parser.write(html);
    parser.done();
}
|
264 |
|
// Public API: clean() is the module's only export.
module.exports = { clean: clean };
|