UNPKG

9.35 kBJavaScriptView Raw
1var ElementType = require("./ElementType.js");
2
/**
 * Streaming HTML/XML parser that reports what it finds through callbacks.
 *
 * @param {Object} [cbs] Handler object whose optional `on*` functions are called
 *   while parsing. When omitted, the instance keeps the `_cbs` it already resolves to.
 * @param {Object} [options] Parser options (`xmlMode`, `lowerCaseTags`). When
 *   omitted, the instance keeps the `_options` it already resolves to.
 */
function Parser(cbs, options){
  if(options){
    this._options = options;
  }
  if(cbs){
    this._cbs = cbs;
  }

  //Per-document state. reset() re-runs this function without arguments to restore it.
  this._buffer = "";          //input that _parseTags has not consumed yet
  this._tagSep = "";          //the separator ("<" or ">") found most recently
  this._stack = [];           //names of the elements that are currently open
  this._wroteSpecial = false; //set once _writeSpecial has run for the current special content
  this._contentFlags = 0;     //sum of the SpecialTags values of the sections we are inside
  this._done = false;         //set by end()
  this._running = true;       //false while the parser is paused
}
15
//Patterns shared by every parser instance.

//Matches one attribute: a whitespace character, the attribute name (group 1) and an
//optional value that is double-quoted (group 2), single-quoted (group 3) or unquoted
//(group 4). The pattern carries the `g` flag, so exec() advances its lastIndex.
var _reAttrib = /\s([^\s\/]+?)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+))|(?=\s)|\/|$)/g;

//Matches whatever terminates a tag name: whitespace, a slash or the end of the input.
var _reTail = /\s|\/|$/;
19
//Fallback options, used by every instance that was created without its own.
Parser.prototype._options = {
  //false: <script> and <style> content receives the special (raw) treatment
  xmlMode: false,

  //true: every tag name is passed through .toLowerCase()
  lowerCaseTags: false
};
24
/*
  Fallback handler: a plain, empty object, so that a parser created
  without a handler can look callbacks up without throwing.

  Callbacks the parser looks for (all optional):
    onattribute, oncdataend, oncdatastart, onclosetag, oncomment,
    oncommentend, onend, onerror, onopentag, onopentagname,
    onprocessinginstruction, onreset, ontext
*/
Parser.prototype._cbs = {};
45
/**
 * Parses a complete document in one call: resets the parser, feeds it
 * `data` and then signals the end of the input.
 *
 * @param {string} data The whole document.
 */
Parser.prototype.parseComplete = function(data){
  var parser = this;

  parser.reset();
  parser.write(data);
  parser.end();
};
52
/**
 * Feeds one more piece of the document to the parser (also available as `parseChunk`).
 *
 * The chunk is appended to the internal buffer and parsed right away unless the
 * parser is paused, in which case it waits in the buffer.
 *
 * Writing after the parser is done is an error: it is reported through
 * `_handleError` and the chunk is discarded. Previously the chunk was still
 * buffered and parsed whenever an `onerror` callback swallowed the error, so
 * whether the chunk was processed depended on how the error was delivered.
 *
 * @param {string} data The next chunk of the document.
 */
Parser.prototype.parseChunk =
Parser.prototype.write = function(data){
  if(this._done){
    this._handleError(new Error("Attempted to parse chunk after parsing already done"));
    return;
  }

  this._buffer += data; //FIXME: this can be a bottleneck
  if(this._running) this._parseTags();
};
60
/**
 * Signals that the document is complete (also available as `done`).
 * Calling it again once the parser is done has no effect.
 *
 * @param {string} [chunk] A final chunk that is written before finishing.
 */
Parser.prototype.done =
Parser.prototype.end = function(chunk){
  if(!this._done){
    if(chunk){
      this.write(chunk);
    }
    this._done = true;

    //a paused parser finishes later, from resume()
    if(this._running){
      this._finishParsing();
    }
  }
};
71
/**
 * Wraps the document up: parses what is left in the buffer, reports a close
 * tag for every element that is still open and finally calls `onend`.
 */
Parser.prototype._finishParsing = function(){
  //consume the remainder of the buffer
  if(this._buffer){
    this._parseTags(true);
  }

  //close the elements that were never closed, innermost first
  if(this._cbs.onclosetag){
    while(this._stack.length){
      var name = this._stack.pop();
      this._cbs.onclosetag(name);
    }
  }

  if(this._cbs.onend){
    this._cbs.onend();
  }
};
82
/**
 * Pauses parsing. Has no effect once the parser is done.
 */
Parser.prototype.pause = function(){
  if(this._done) return;
  this._running = false;
};
86
/**
 * Continues a paused parser: parses what has been buffered in the meantime
 * and, if end() was called while paused, finishes the document.
 * Does nothing when the parser is not paused.
 */
Parser.prototype.resume = function(){
  if(!this._running){
    this._running = true;
    this._parseTags();

    if(this._done){
      this._finishParsing();
    }
  }
};
93
/**
 * Puts the parser back into its initial state so that it can parse a new
 * document, then notifies the handler through `onreset`.
 */
Parser.prototype.reset = function(){
  //calling the constructor without arguments re-initialises the state
  //and leaves the handler and the options as they are
  Parser.call(this);

  if(this._cbs.onreset){
    this._cbs.onreset();
  }
};
99
/**
 * Returns the tag name at the start of `data`: everything up to the first
 * whitespace character, slash or the end of the string.
 *
 * @param {string} data The content of a tag, starting with its name.
 * @returns {string} The name, lower-cased when the `lowerCaseTags` option is set.
 */
Parser.prototype._parseTagName = function(data){
  //_reTail also matches the end of the input, so there is always an index
  var end = data.search(_reTail),
      name = data.substr(0, end);

  return this._options.lowerCaseTags ? name.toLowerCase() : name;
};
106
//Sections whose content is treated differently. Each one owns a bit in
//Parser#_contentFlags; plain tags do not set any flag.
var SpecialTags = {};
SpecialTags[ElementType.Style] = 1 << 0;   //1
SpecialTags[ElementType.Script] = 1 << 1;  //2
SpecialTags[ElementType.Comment] = 1 << 2; //4
SpecialTags[ElementType.CDATA] = 1 << 3;   //8

//Flag value that belongs to the name of a closing tag
var TagValues = {
  style: 1 << 0,
  script: 1 << 1
};
119
/**
 * Scans `this._buffer` for "<" and ">" separators and dispatches every piece that
 * lies between two separators: tag content goes to the open tag / close tag /
 * comment / CDATA / processing instruction handling, everything else is reported
 * as text. Input that could not be consumed yet stays in `this._buffer`.
 *
 * The loop stops as soon as the parser is paused (`this._running` is false).
 *
 * @param {boolean} [force] When truthy, the remainder of the buffer is consumed
 *   to its end instead of waiting for another separator. _finishParsing uses
 *   this once the input is complete.
 */
Parser.prototype._parseTags = function(force){
  var buffer = this._buffer, current = 0;

  var next, rawData, elementData, lastTagSep;

  var opening = buffer.indexOf("<"), closing = buffer.indexOf(">");

  //if force is true, parse everything
  if(force) opening = Infinity;

  //opening !== closing is just false if both are -1
  while(opening !== closing && this._running){
    lastTagSep = this._tagSep;

    //pick whichever separator comes first and look up its successor
    if((opening !== -1 && opening < closing) || closing === -1){
      next = opening;
      this._tagSep = "<";
      opening = buffer.indexOf("<", next + 1);
    }
    else{
      next = closing;
      this._tagSep = ">";
      closing = buffer.indexOf(">", next + 1);
    }
    rawData = buffer.substring(current, next); //The next chunk of data to parse

    //set elements for next run
    current = next + 1;

    if(this._contentFlags >= SpecialTags[ElementType.CDATA]){
      // We're inside a CDATA section
      this._writeCDATA(rawData);
    }
    else if(this._contentFlags >= SpecialTags[ElementType.Comment]){
      //We're in a comment tag
      this._writeComment(rawData);
    }
    else if(lastTagSep === "<"){
      //the piece started with "<", so it is the content of a tag
      elementData = rawData.trimLeft();
      if(elementData.charAt(0) === "/"){
        elementData = this._parseTagName(elementData.substr(1));
        if(this._contentFlags !== 0){
          //Inside <script>/<style> only the matching closing tag ends the
          //section. The comparison has to be exact: with ">=" a </style>
          //inside <script> (2 >= 1) removed part of the script flag and
          //was reported as a real closing tag.
          if(this._contentFlags === TagValues[elementData]){
            //remove the flag
            this._contentFlags -= TagValues[elementData];
          }
          else {
            //any other closing tag is just content of the section
            this._writeSpecial(rawData, lastTagSep);
            continue;
          }
        }
        this._processCloseTag(elementData);
      }
      else if(elementData.charAt(0) === "!"){
        if(elementData.substr(1, 2) === "--"){
          //This tag is a comment
          this._contentFlags += SpecialTags[ElementType.Comment];
          this._writeComment(rawData.substr(3));
        }
        else if(elementData.substr(1, 7) === "[CDATA["){
          this._contentFlags += SpecialTags[ElementType.CDATA];
          if(this._cbs.oncdatastart) this._cbs.oncdatastart();
          this._writeCDATA(elementData.substr(8));
        }
        else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep);
        //TODO: This isn't a processing instruction, needs a new name
        else if(this._cbs.onprocessinginstruction){
          this._cbs.onprocessinginstruction(
            "!" + this._parseTagName(elementData.substr(1)),
            elementData
          );
        }
      }
      else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep);
      else if(elementData.charAt(0) === "?"){
        if(this._cbs.onprocessinginstruction){
          this._cbs.onprocessinginstruction(
            "?" + this._parseTagName(elementData.substr(1)),
            elementData
          );
        }
      }
      else this._processOpenTag(elementData);
    }
    else{
      //the piece did not start with "<": it is text
      if(this._contentFlags !== 0){
        this._writeSpecial(rawData, ">");
      }
      else if(rawData !== "" && this._cbs.ontext){
        if(this._tagSep === ">") rawData += ">"; //it's the second > in a row
        this._cbs.ontext(rawData);
      }
    }
  }

  //keep what has not been consumed for the next call
  this._buffer = buffer.substring(current);
};
221
/**
 * Handles one piece of a CDATA section. A piece that is followed by ">" and
 * ends in "]]" closes the section; any other piece is reported as text
 * together with the separator that followed it.
 *
 * @param {string} data The piece, without the separator that followed it.
 */
Parser.prototype._writeCDATA = function(data){
  var closesSection = this._tagSep === ">" && data.substr(-2) === "]]";

  if(!closesSection){
    if(this._cbs.ontext) this._cbs.ontext(data + this._tagSep);
    return;
  }

  //report what precedes the "]]", unless there is nothing
  if(data.length !== 2 && this._cbs.ontext){
    this._cbs.ontext(data.slice(0, -2));
  }
  this._contentFlags -= SpecialTags[ElementType.CDATA];
  if(this._cbs.oncdataend) this._cbs.oncdataend();
};
233
/**
 * Handles one piece of a comment. A piece that is followed by ">" and ends
 * in "--" closes the comment; any other piece is reported through
 * `oncomment` together with the separator that followed it.
 *
 * @param {string} rawData The piece, without the separator that followed it.
 */
Parser.prototype._writeComment = function(rawData){
  var closesComment = this._tagSep === ">" && rawData.substr(-2) === "--";

  if(closesComment){
    //leave the comment and clear the written flag
    this._contentFlags -= SpecialTags[ElementType.Comment];
    this._wroteSpecial = false;
    if(this._cbs.oncomment) this._cbs.oncomment(rawData.slice(0, -2));
    if(this._cbs.oncommentend) this._cbs.oncommentend();
    return;
  }

  if(this._cbs.oncomment) this._cbs.oncomment(rawData + this._tagSep);
};
244
/**
 * Reports a piece of special (script/style) content as text.
 *
 * @param {string} rawData The piece of content.
 * @param {string} lastTagSep The separator that preceded the piece. It is put
 *   back in front of every piece except the first one of the section.
 */
Parser.prototype._writeSpecial = function(rawData, lastTagSep){
  if(!this._wroteSpecial){
    //first piece: the separator in front of it belongs to the opening tag
    this._wroteSpecial = true;
    if(rawData !== "" && this._cbs.ontext) this._cbs.ontext(rawData);
    return;
  }

  //later pieces: the separator is part of the content
  if(this._cbs.ontext) this._cbs.ontext(lastTagSep + rawData);
};
255
//Names of the elements that never have content. The object has no prototype,
//so `name in emptyTags` is only true for the names listed here.
var emptyTags = Object.create(null);
(function(names){
  for(var i = 0; i < names.length; i++){
    emptyTags[names[i]] = true;
  }
}([
  "area", "base", "basefont", "br", "col", "frame", "hr",
  "img", "input", "isindex", "link", "meta", "param", "embed"
]));
273
/**
 * Handles a closing tag. The innermost open element with that name is closed
 * together with everything that was opened inside it; a closing tag without
 * a matching open element is ignored. Outside of xmlMode, closing tags of
 * empty elements are ignored too, except for </br>, which becomes <br/>.
 *
 * @param {string} name The name of the closing tag.
 */
Parser.prototype._processCloseTag = function(name){
  var closable = this._stack && (!(name in emptyTags) || this._options.xmlMode);

  if(!closable){
    //many browsers (eg. Safari, Chrome) convert </br> to <br>
    if(name === "br" && !this._options.xmlMode){
      this._processOpenTag(name + "/");
    }
    return;
  }

  var pos = this._stack.lastIndexOf(name);
  if(pos === -1) return;

  if(this._cbs.onclosetag){
    //report every element from the innermost one up to the match
    var remaining = this._stack.length - pos;
    while(remaining--) this._cbs.onclosetag(this._stack.pop());
  }
  else{
    //nobody is listening, just drop the elements
    this._stack.splice(pos);
  }
};
288
/**
 * Reports every attribute found in `data` through `onattribute(name, value)`.
 * Attributes without a value are reported with an empty string.
 *
 * All attributes are collected before the first callback runs. `_reAttrib`
 * is a shared regular expression with the `g` flag, so its `lastIndex`
 * survives between calls: invoking the callback in the middle of the scan
 * left a stale `lastIndex` behind when the callback threw, and let a callback
 * that parses further input disturb the scan that was still in progress.
 *
 * @param {string} data The content of an opening tag, including its name.
 */
Parser.prototype._parseAttributes = function(data){
  var found = [], match, i;

  //never start from a position that an earlier, interrupted scan left behind
  _reAttrib.lastIndex = 0;
  while((match = _reAttrib.exec(data)) !== null){
    found.push(match[1], match[2] || match[3] || match[4] || "");
  }

  for(i = 0; i < found.length; i += 2){
    this._cbs.onattribute(found[i], found[i + 1]);
  }
};
294
/**
 * Collects the attributes of an opening tag into an object.
 *
 * `_reAttrib` is a shared regular expression with the `g` flag, so exec()
 * continues from its `lastIndex`. The index is reset first because a scan
 * that was interrupted elsewhere (for example by a throwing callback) would
 * otherwise make this scan start in the middle of `data` and miss attributes.
 *
 * @param {string} data The content of an opening tag, including its name.
 * @returns {Object} Attribute names mapped to their values; attributes without
 *   a value map to an empty string. When a name occurs twice the last value wins.
 */
var parseAttributes = function(data){
  var attrs = {}, match;

  _reAttrib.lastIndex = 0;
  while((match = _reAttrib.exec(data)) !== null){
    attrs[match[1]] = match[2] || match[3] || match[4] || "";
  }
  return attrs;
};
303
/**
 * Handles an opening tag: reports its name and attributes, and either closes
 * it right away (self-closing and empty elements) or puts it on the stack of
 * open elements. Outside of xmlMode, <script> and <style> additionally switch
 * the parser to special content handling.
 *
 * @param {string} data The content of the tag, starting with its name.
 */
Parser.prototype._processOpenTag = function(data){
  var name = this._parseTagName(data);

  var type = ElementType.Tag;
  if(!this._options.xmlMode){
    if(name === "script") type = ElementType.Script;
    else if(name === "style") type = ElementType.Style;
  }

  if(this._cbs.onopentagname){
    this._cbs.onopentagname(name);
  }
  if(this._cbs.onopentag){
    this._cbs.onopentag(name, parseAttributes(data));
  }
  if(this._cbs.onattribute){
    this._parseAttributes(data);
  }

  //A tag that terminates itself gets an explicit, separate closing tag
  var closesItself = data.substr(-1) === "/" || (name in emptyTags && !this._options.xmlMode);
  if(closesItself){
    if(this._cbs.onclosetag) this._cbs.onclosetag(name);
    return;
  }

  if(type !== ElementType.Tag){
    //entering script/style content
    this._contentFlags += SpecialTags[type];
    this._wroteSpecial = false;
  }
  this._stack.push(name);
};
327
/**
 * Reports an error through the `onerror` callback, or throws it when the
 * handler has no such callback.
 *
 * @param {Error} error The error to report.
 * @throws {Error} `error` itself, when there is no `onerror` callback.
 */
Parser.prototype._handleError = function(error){
  if(!this._cbs.onerror){
    throw error;
  }
  this._cbs.onerror(error);
};
333
334module.exports = Parser;
\No newline at end of file