UNPKG

5.45 kBJavaScriptView Raw
/**
 * Simple HTML Parser
 *
 * @author Zongmin Lei<leizongmin@gmail.com>
 */
6
7var _ = require("./util");
8
/**
 * Extract the lower-cased tag name from the source text of a single tag.
 *
 * The name starts right after the leading "<". One leading "/" (closing
 * tag) and one trailing "/" (self-closing tag) are removed from the result.
 *
 * @param {String} html source of one tag, e.g. '<a href="#">' or '</a>'
 * @return {String} the tag name in lower case
 */
function getTagName(html) {
  // presumably _.spaceIndex returns the index of the first whitespace
  // character, or -1 when there is none -- confirm against ./util
  var spacePos = _.spaceIndex(html);
  // With no whitespace the name runs up to (not including) the final
  // character of the tag; otherwise it is cut just after position spacePos.
  var endPos = spacePos === -1 ? -1 : spacePos + 1;
  var name = _.trim(html.slice(1, endPos)).toLowerCase();
  if (name.charAt(0) === "/") {
    name = name.slice(1);
  }
  if (name.charAt(name.length - 1) === "/") {
    name = name.slice(0, -1);
  }
  return name;
}
27
/**
 * Tell whether the given tag source is a closing tag.
 *
 * @param {String} html source of one tag, e.g. '</a>'
 * @return {Boolean} true when the text begins with "</"
 */
function isClosing(html) {
  return html.charAt(0) === "<" && html.charAt(1) === "/";
}
37
/**
 * Scan an HTML string, hand every tag to `onTag`, and return the rebuilt
 * string.
 *
 * Text that is not part of a complete tag is passed through `escapeHtml`.
 * Each complete tag (from "<" to the matching ">") is replaced by whatever
 * `onTag` returns. A ">" inside a quoted attribute value (a quote that
 * directly follows "=") does not end the tag.
 *
 * @param {String} html
 * @param {Function} onTag called as
 *   `onTag(sourcePosition, position, tagName, tagHtml, isClosing)`, where
 *   `sourcePosition` is the tag's index in `html` and `position` is the
 *   length of the output built so far; its return value is appended to the
 *   output
 * @param {Function} escapeHtml called with each run of non-tag text; its
 *   return value is appended to the output
 * @return {String}
 */
function parseTag(html, onTag, escapeHtml) {
  "use strict";

  var rethtml = "";
  var lastPos = 0; // index in html of the first character not yet emitted
  // Index of the "<" that opened the current tag, or false when outside a
  // tag. Compared with === false because index 0 is a valid start position.
  var tagStart = false;
  // The quote character that opened the current attribute value, or false.
  var quoteStart = false;
  var currentPos = 0;
  var len = html.length;
  var currentTagName = "";
  var currentHtml = "";

  for (currentPos = 0; currentPos < len; currentPos++) {
    var c = html.charAt(currentPos);

    // Outside a tag: only "<" is interesting.
    if (tagStart === false) {
      if (c === "<") {
        tagStart = currentPos;
      }
      continue;
    }

    // Inside a quoted attribute value: wait for the matching quote.
    if (quoteStart !== false) {
      if (c === quoteStart) {
        quoteStart = false;
      }
      continue;
    }

    if (c === "<") {
      // A new "<" before the previous one was closed: everything up to
      // here is plain text, and the tag restarts at this position.
      rethtml += escapeHtml(html.slice(lastPos, currentPos));
      tagStart = currentPos;
      lastPos = currentPos;
      continue;
    }

    if (c === ">") {
      // Emit the text preceding the tag, then the processed tag itself.
      rethtml += escapeHtml(html.slice(lastPos, tagStart));
      currentHtml = html.slice(tagStart, currentPos + 1);
      currentTagName = getTagName(currentHtml);
      rethtml += onTag(
        tagStart,
        rethtml.length,
        currentTagName,
        currentHtml,
        isClosing(currentHtml)
      );
      lastPos = currentPos + 1;
      tagStart = false;
      continue;
    }

    // A quote only opens an attribute value when it directly follows "=".
    if ((c === '"' || c === "'") && html.charAt(currentPos - 1) === "=") {
      quoteStart = c;
    }
  }

  // Whatever is left (trailing text or an unterminated tag) is escaped.
  if (lastPos < len) {
    rethtml += escapeHtml(html.slice(lastPos));
  }

  return rethtml;
}
106
// Characters that are removed from attribute names before they are reported.
var REGEXP_ILLEGAL_ATTR_NAME = /[^a-zA-Z0-9_:\.\-]/gim;

/**
 * Parse the attribute portion of a tag, hand each attribute to `onAttr`,
 * and return the rebuilt attribute string.
 *
 * Supported forms are `name="value"`, `name='value'`, `name=value`
 * (value ends at whitespace) and a bare `name` (reported with value "").
 * Names are trimmed, stripped of illegal characters and lower-cased;
 * attributes whose name ends up empty are dropped.
 *
 * @param {String} html e.g. `href="#" target="_blank"`
 * @param {Function} onAttr called as `onAttr(name, value)`; a truthy return
 *   value is kept as that attribute's output text
 * @return {String} the kept outputs joined with single spaces and trimmed
 */
function parseAttr(html, onAttr) {
  "use strict";

  var lastPos = 0; // start of the name or value currently being read
  var retAttrs = [];
  var tmpName = false; // pending attribute name once its "=" was seen
  var len = html.length;
  // The whitespace normalisation below replaces every whitespace character
  // with a single space (length is preserved), so running it once is
  // enough; repeating it for every whitespace character was quadratic.
  var normalized = false;

  function addAttr(name, value) {
    name = _.trim(name);
    name = name.replace(REGEXP_ILLEGAL_ATTR_NAME, "").toLowerCase();
    if (name.length < 1) return;
    var ret = onAttr(name, value || "");
    if (ret) retAttrs.push(ret);
  }

  // Analyse the characters one by one.
  for (var i = 0; i < len; i++) {
    var c = html.charAt(i);
    var v, j;

    // First "=" after a name: remember the name and start reading a value.
    if (tmpName === false && c === "=") {
      tmpName = html.slice(lastPos, i);
      lastPos = i + 1;
      continue;
    }

    // Quoted value: the quote must directly follow the "=".
    if (tmpName !== false) {
      if (
        i === lastPos &&
        (c === '"' || c === "'") &&
        html.charAt(i - 1) === "="
      ) {
        j = html.indexOf(c, i + 1);
        if (j === -1) {
          // No closing quote: stop scanning; the remainder is handled
          // after the loop.
          break;
        } else {
          v = _.trim(html.slice(lastPos + 1, j));
          addAttr(tmpName, v);
          tmpName = false;
          i = j;
          lastPos = i + 1;
          continue;
        }
      }
    }

    if (/\s|\n|\t/.test(c)) {
      if (!normalized) {
        // Lazily turn every whitespace character into a plain space so the
        // helpers below only have to look for " ". This is deliberately
        // done on first use (not up front) so that values sliced before
        // this point keep their original whitespace, as before.
        html = html.replace(/\s|\n|\t/g, " ");
        normalized = true;
      }
      if (tmpName === false) {
        // Whitespace after a name: either "name   =" or a bare attribute.
        j = findNextEqual(html, i);
        if (j === -1) {
          v = _.trim(html.slice(lastPos, i));
          addAttr(v);
          tmpName = false;
          lastPos = i + 1;
          continue;
        } else {
          // Jump to just before the "=" so the next iteration sees it.
          i = j - 1;
          continue;
        }
      } else {
        // Whitespace while reading a value: either padding right after
        // the "=" (skip it) or the end of an unquoted value.
        j = findBeforeEqual(html, i - 1);
        if (j === -1) {
          v = _.trim(html.slice(lastPos, i));
          v = stripQuoteWrap(v);
          addAttr(tmpName, v);
          tmpName = false;
          lastPos = i + 1;
          continue;
        } else {
          continue;
        }
      }
    }
  }

  // Flush whatever is left: a trailing bare name or a trailing value.
  if (lastPos < html.length) {
    if (tmpName === false) {
      addAttr(html.slice(lastPos));
    } else {
      addAttr(tmpName, stripQuoteWrap(_.trim(html.slice(lastPos))));
    }
  }

  return _.trim(retAttrs.join(" "));
}
200
/**
 * Starting at index `i`, skip spaces and report whether the next
 * non-space character is "=".
 *
 * @param {String} str
 * @param {Number} i index to start scanning from (inclusive)
 * @return {Number} index of the "=" if it is the first non-space
 *   character at or after `i`; otherwise -1 (including when only spaces,
 *   or nothing, remain)
 */
function findNextEqual(str, i) {
  for (; i < str.length; i++) {
    var c = str[i];
    if (c === " ") continue;
    if (c === "=") return i;
    return -1;
  }
  // Ran off the end without meeting a non-space character. Return -1
  // explicitly instead of falling through with undefined.
  return -1;
}
209
/**
 * Starting at index `i` and moving backwards, skip spaces and report
 * whether the nearest non-space character is "=".
 *
 * @param {String} str
 * @param {Number} i index to start scanning from (inclusive)
 * @return {Number} index of the "=" if it is the first non-space
 *   character at or before `i`; otherwise -1
 */
function findBeforeEqual(str, i) {
  // The bound is i >= 0 so that an "=" at the very first position is
  // found too.
  for (; i >= 0; i--) {
    var c = str[i];
    if (c === " ") continue;
    if (c === "=") return i;
    return -1;
  }
  // Only spaces (or nothing) before the start position.
  return -1;
}
218
/**
 * Tell whether a string starts and ends with the same quote character
 * (either `"` or `'`).
 *
 * Note that a string consisting of a single quote character satisfies
 * this check, because its first and last character are the same one.
 *
 * @param {String} text
 * @return {Boolean}
 */
function isQuoteWrapString(text) {
  var first = text[0];
  if (first !== '"' && first !== "'") {
    return false;
  }
  return text[text.length - 1] === first;
}
229
/**
 * Remove one pair of surrounding quotes from a string, if present.
 *
 * @param {String} text
 * @return {String} `text` without its first and last character when
 *   `isQuoteWrapString(text)` is true; otherwise `text` unchanged
 */
function stripQuoteWrap(text) {
  return isQuoteWrapString(text) ? text.slice(1, -1) : text;
}
237
238exports.parseTag = parseTag;
239exports.parseAttr = parseAttr;