UNPKG

5.95 kBJavaScriptView Raw
/**
 * Simple HTML Parser
 *
 * @author Zongmin Lei<leizongmin@gmail.com>
 */
6
7var _ = require("./util");
8
/**
 * Extract the lower-cased tag name from the source text of a single tag.
 *
 * The name is whatever sits between the leading "<" and either the first
 * whitespace character (as reported by `_.spaceIndex`) or, when there is no
 * whitespace, the trailing ">". One leading "/" (closing tag) and one
 * trailing "/" (self-closing tag) are removed from the result.
 *
 * @param {String} html source of one tag, e.g. '<a href="#">'
 * @return {String} tag name, e.g. 'a'
 */
function getTagName(html) {
  var firstSpace = _.spaceIndex(html);
  // No whitespace: drop only the final ">"; otherwise cut at the whitespace.
  var end = firstSpace === -1 ? -1 : firstSpace + 1;
  var name = _.trim(html.slice(1, end)).toLowerCase();

  if (name.charAt(0) === "/") {
    name = name.slice(1);
  }
  if (name.charAt(name.length - 1) === "/") {
    name = name.slice(0, -1);
  }
  return name;
}
28
/**
 * Tell whether the given tag source is a closing tag, i.e. starts with "</".
 *
 * @param {String} html source of one tag, e.g. '</a>'
 * @return {Boolean} true when the first two characters are "</"
 */
function isClosing(html) {
  return html.charAt(0) === "<" && html.charAt(1) === "/";
}
38
/**
 * Scan `html` character by character, hand every complete tag to `onTag`
 * and every stretch of non-tag text to `escapeHtml`, and concatenate what
 * they return.
 *
 * A tag starts at "<" and ends at the next ">" that is not inside a quoted
 * attribute value. A quote character only opens a quoted value when, looking
 * backwards past whitespace, the previous character is "=". A "<" seen while
 * a tag is already open restarts the tag there; the text before it is
 * escaped as plain text.
 *
 * @param {String} html
 * @param {Function} onTag called as
 *   `onTag(sourcePosition, position, tag, html, isClosing)` where
 *   `sourcePosition` is the tag's index in the input, `position` is the
 *   length of the output built so far, `tag` is the lower-cased tag name,
 *   `html` is the tag's source text and `isClosing` is a boolean
 * @param {Function} escapeHtml called with each piece of non-tag text
 * @return {String} the processed html
 */
function parseTag(html, onTag, escapeHtml) {
  "use strict";

  var total = html.length;
  var output = "";
  var flushedUpTo = 0; // everything before this index is already in `output`
  var openAt = false; // index of the currently open "<", or false
  var activeQuote = false; // quote character of the open attribute value, or false

  // True when the quote at `quotePos` is preceded by "=", ignoring whitespace.
  function followsEquals(quotePos) {
    for (var k = quotePos - 1; k >= 0; k--) {
      var prev = html.charAt(k);
      if (prev === "=") return true;
      if (prev.trim() !== "") return false;
    }
    return false;
  }

  for (var pos = 0; pos < total; pos++) {
    var ch = html.charAt(pos);

    // Outside of any tag: only "<" is interesting.
    if (openAt === false) {
      if (ch === "<") openAt = pos;
      continue;
    }

    // Inside a quoted attribute value: only the matching quote ends it.
    if (activeQuote !== false) {
      if (ch === activeQuote) activeQuote = false;
      continue;
    }

    if (ch === "<") {
      // The previous "<" was not a real tag; emit it as text and restart here.
      output += escapeHtml(html.slice(flushedUpTo, pos));
      openAt = pos;
      flushedUpTo = pos;
    } else if (ch === ">") {
      output += escapeHtml(html.slice(flushedUpTo, openAt));
      var tagHtml = html.slice(openAt, pos + 1);
      var tagName = getTagName(tagHtml);
      output += onTag(
        openAt,
        output.length,
        tagName,
        tagHtml,
        isClosing(tagHtml)
      );
      flushedUpTo = pos + 1;
      openAt = false;
    } else if ((ch === '"' || ch === "'") && followsEquals(pos)) {
      activeQuote = ch;
    }
  }

  // Trailing text, including any tag that was never closed.
  if (flushedUpTo < total) {
    output += escapeHtml(html.slice(flushedUpTo));
  }

  return output;
}
115
// Characters that are stripped from attribute names before they are reported.
var REGEXP_ILLEGAL_ATTR_NAME = /[^a-zA-Z0-9\\_:.-]/gim;

/**
 * Parse the attribute portion of a tag and rebuild it from whatever `onAttr`
 * returns for each attribute.
 *
 * Attribute names are trimmed, stripped of characters matching
 * REGEXP_ILLEGAL_ATTR_NAME and lower-cased; attributes whose name ends up
 * empty are skipped. Values may be quoted (single or double), unquoted, or
 * absent (reported as ""). Truthy return values of `onAttr` are joined with a
 * single space; falsy ones drop the attribute.
 *
 * Note on whitespace: the first time the scan meets a whitespace character
 * outside a quoted value, every whitespace character in the remaining input
 * is turned into a plain space. Quoted values consumed before that point keep
 * their original whitespace. This long-standing behavior is preserved.
 *
 * @param {String} html e.g. `href="#" target="_blank"`
 * @param {Function} onAttr e.g. `function (name, value)`
 * @return {String}
 */
function parseAttr(html, onAttr) {
  "use strict";

  var lastPos = 0; // start of the text not yet consumed
  var lastMarkPos = 0; // index of the opening quote of the pending value
  var retAttrs = [];
  var tmpName = false; // raw name of the attribute awaiting its value, or false
  var len = html.length;
  // The whitespace normalization is idempotent, so it only has to run once;
  // re-running it for every whitespace character made the scan quadratic.
  var normalized = false;

  function addAttr(name, value) {
    name = _.trim(name);
    name = name.replace(REGEXP_ILLEGAL_ATTR_NAME, "").toLowerCase();
    if (name.length < 1) return;
    var ret = onAttr(name, value || "");
    if (ret) retAttrs.push(ret);
  }

  // Analyze the input one character at a time.
  for (var i = 0; i < len; i++) {
    var c = html.charAt(i);
    var v, j;

    // "=" ends the attribute name and tells us where a quoted value may start.
    if (tmpName === false && c === "=") {
      tmpName = html.slice(lastPos, i);
      lastPos = i + 1;
      var next = html.charAt(lastPos);
      lastMarkPos =
        next === '"' || next === "'"
          ? lastPos
          : findNextQuotationMark(html, i + 1);
      continue;
    }

    // Opening quote of the pending value: consume up to the matching quote.
    if (tmpName !== false && i === lastMarkPos) {
      j = html.indexOf(c, i + 1);
      if (j === -1) {
        // Unterminated quote: the tail handling below reports the rest.
        break;
      }
      v = _.trim(html.slice(lastMarkPos + 1, j));
      addAttr(tmpName, v);
      tmpName = false;
      i = j;
      lastPos = i + 1;
      continue;
    }

    if (/\s|\n|\t/.test(c)) {
      if (!normalized) {
        html = html.replace(/\s|\n|\t/g, " ");
        normalized = true;
      }
      if (tmpName === false) {
        j = findNextEqual(html, i);
        if (j === undefined) {
          // Only whitespace remains and the helper reported no result; stop
          // scanning and let the tail handling report the last name. (This
          // used to happen implicitly through `i` becoming NaN.)
          break;
        }
        if (j === -1) {
          // Name without a value.
          v = _.trim(html.slice(lastPos, i));
          addAttr(v);
          tmpName = false;
          lastPos = i + 1;
        } else {
          // Whitespace before "=": jump so the next iteration lands on it.
          i = j - 1;
        }
      } else {
        j = findBeforeEqual(html, i - 1);
        if (j === -1) {
          // Whitespace after an unquoted value ends that value.
          v = _.trim(html.slice(lastPos, i));
          v = stripQuoteWrap(v);
          addAttr(tmpName, v);
          tmpName = false;
          lastPos = i + 1;
        }
        // Otherwise this is whitespace right after "=": keep scanning.
      }
    }
  }

  // Whatever is left is either a bare name or the value of the pending name.
  if (lastPos < html.length) {
    if (tmpName === false) {
      addAttr(html.slice(lastPos));
    } else {
      addAttr(tmpName, stripQuoteWrap(_.trim(html.slice(lastPos))));
    }
  }

  return _.trim(retAttrs.join(" "));
}
209
/**
 * Starting at index `i`, skip spaces and report whether the next non-space
 * character is "=".
 *
 * @param {String} str
 * @param {Number} i index to start scanning from (inclusive)
 * @return {Number} index of the "=", or -1 when the next non-space character
 *   is something else or the end of the string is reached
 */
function findNextEqual(str, i) {
  for (; i < str.length; i++) {
    var c = str.charAt(i);
    if (c === " ") continue;
    if (c === "=") return i;
    return -1;
  }
  // Only spaces (or nothing) remained: previously this fell through and
  // returned undefined instead of the documented "not found" value.
  return -1;
}
218
/**
 * Starting at index `i`, skip spaces and report whether the next non-space
 * character is a single or double quotation mark.
 *
 * @param {String} str
 * @param {Number} i index to start scanning from (inclusive)
 * @return {Number} index of the quotation mark, or -1 when the next
 *   non-space character is something else or the end of the string is reached
 */
function findNextQuotationMark(str, i) {
  for (; i < str.length; i++) {
    var c = str.charAt(i);
    if (c === " ") continue;
    if (c === "'" || c === '"') return i;
    return -1;
  }
  // Only spaces (or nothing) remained: previously this fell through and
  // returned undefined instead of the documented "not found" value.
  return -1;
}
227
/**
 * Starting at index `i` and moving towards the start of the string, skip
 * spaces and report whether the first non-space character is "=".
 *
 * @param {String} str
 * @param {Number} i index to start scanning from (inclusive)
 * @return {Number} index of the "=", or -1 when the first non-space character
 *   is something else or the start of the string is passed
 */
function findBeforeEqual(str, i) {
  // `i >= 0` so that a "=" sitting at index 0 is inspected too; the previous
  // `i > 0` bound stopped one character short.
  for (; i >= 0; i--) {
    var c = str.charAt(i);
    if (c === " ") continue;
    if (c === "=") return i;
    return -1;
  }
  // Only spaces (or nothing) preceded the start index: previously this fell
  // through and returned undefined instead of the "not found" value.
  return -1;
}
236
/**
 * Tell whether `text` begins and ends with the same quotation mark, either
 * both double quotes or both single quotes.
 *
 * A one-character string consisting of a single quotation mark counts as
 * wrapped, because its first and last character are the same character.
 *
 * @param {String} text
 * @return {Boolean}
 */
function isQuoteWrapString(text) {
  var first = text[0];
  var last = text[text.length - 1];
  return first === last && (first === '"' || first === "'");
}
247
/**
 * Remove one pair of surrounding quotation marks from `text` when
 * `isQuoteWrapString` reports that it is wrapped; otherwise return `text`
 * unchanged.
 *
 * @param {String} text
 * @return {String}
 */
function stripQuoteWrap(text) {
  return isQuoteWrapString(text) ? text.slice(1, -1) : text;
}
255
256exports.parseTag = parseTag;
257exports.parseAttr = parseAttr;