UNPKG

9.39 kBJavaScriptView Raw
1var ElementType = require("./ElementType.js");
2
//Creates a parser in its blank initial state.
//	cbs:     object with the optional event callbacks; defaultCbs when omitted
//	options: parser settings; defaultOpts when omitted
function Parser(cbs, options){
	//configuration
	this._options = options || defaultOpts;
	this._cbs = cbs || defaultCbs;

	//input that has been written but not consumed yet
	this._buffer = "";
	//separator ("<" or ">") that terminated the chunk handled last
	this._tagSep = "";
	//names of the elements that are currently open
	this._stack = [];

	//bookkeeping for the raw content of script/style elements, comments and CDATA
	this._wroteSpecial = false;
	this._contentFlags = 0;

	//lifecycle: _done is set by end(), _running is cleared while paused
	this._done = false;
	this._running = true;
}
14
//Regular expressions shared by the parsing helpers.

//Matches one attribute (preceded by whitespace). Capture 1 is the name,
//captures 2, 3 and 4 hold a double quoted, single quoted or unquoted value.
//The global flag means exec() keeps its position in lastIndex between calls.
var _reAttrib = /\s([^\s\/]+?)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+))|(?=\s)|\/|$)/g;

//Matches the end of a tag name: whitespace, a slash or the end of the string
var _reTail = /\s|\/|$/;
18
//Settings that apply when the Parser is created without an options object
var defaultOpts = {
	//outside of XML mode script/style elements and empty tags get special treatment
	xmlMode: false,
	//when true, every parsed tag name is passed through .toLowerCase()
	lowerCaseTags: false
};
23
//Handler used when the Parser is created without callbacks. It is an empty
//plain object, so every "is this callback defined?" check fails and the
//parser does not throw.
//
//Callbacks the parser looks up on a handler object:
//	onattribute, oncdataend, oncdatastart, onclosetag, oncomment,
//	oncommentend, onend, onerror, onopentag, onopentagname,
//	onprocessinginstruction, onreset, ontext
var defaultCbs = {};
44
//Convenience entry point for a document that is available as a whole:
//resets the parser, writes `data` and then signals the end of the input
Parser.prototype.parseComplete = function(data){
	var parser = this;

	parser.reset();
	parser.write(data);
	parser.end();
};
51
//Parses a piece of an HTML document (also available as parseChunk).
//The chunk is appended to the internal buffer and, unless the parser is
//paused, parsed right away.
//Writing after end() is reported through _handleError (which throws when no
//onerror callback exists); the chunk is then dropped instead of being parsed,
//so no events are emitted for a document that was already finished.
Parser.prototype.parseChunk =
Parser.prototype.write = function(data){
	if(this._done){
		this._handleError("Attempted to parse chunk after parsing already done");
		return;
	}
	this._buffer += data; //FIXME: this can be a bottleneck
	if(this._running) this._parseTags();
};
59
//Signals that the document is complete (also available as done).
//	chunk: optional last piece of input, written before finishing
//Calling it again after the parser is done has no effect. While the parser
//is paused only the done flag is set; finishing is left to resume().
Parser.prototype.done =
Parser.prototype.end = function(chunk){
	if(!this._done){
		if(chunk) this.write(chunk);
		this._done = true;

		if(this._running) this._finishParsing();
	}
};
70
//Wraps up the document: consumes whatever is left in the buffer, reports a
//closing tag for every element that is still open and finally calls onend
Parser.prototype._finishParsing = function(){
	//force the remaining input through the parser
	if(this._buffer) this._parseTags(true);

	//the stack is only unwound when somebody listens for closing tags
	if(this._cbs.onclosetag){
		while(this._stack.length > 0){
			this._cbs.onclosetag(this._stack.pop());
		}
	}

	if(this._cbs.onend) this._cbs.onend();
};
81
//Stops the parser from consuming input until resume() is called.
//Has no effect once the parser is done.
Parser.prototype.pause = function(){
	if(this._done) return;
	this._running = false;
};
85
//Continues a paused parser: parses the input that was buffered in the
//meantime and, when end() was called while paused, finishes the document.
//Does nothing if the parser is already running.
Parser.prototype.resume = function(){
	if(!this._running){
		this._running = true;
		this._parseTags();
		if(this._done) this._finishParsing();
	}
};
92
//Resets the parser to a blank state, ready to parse a new HTML document.
//The constructor is re-run on this instance with the callbacks and options
//it already has; calling it without arguments would replace them with the
//defaults and silently disconnect the handler (onreset included).
Parser.prototype.reset = function(){
	Parser.call(this, this._cbs, this._options);
	if(this._cbs.onreset) this._cbs.onreset();
};
98
//Returns the tag name at the start of `data`: everything up to the first
//whitespace or slash, or all of `data` when neither occurs. The name is
//lower-cased when the lowerCaseTags option is set.
Parser.prototype._parseTagName = function(data){
	//_reTail also matches the end of the string, so search() never yields -1
	var end = data.search(_reTail);
	var name = data.slice(0, end);

	return this._options.lowerCaseTags ? name.toLowerCase() : name;
};
105
//Bit flags for the content types that are not parsed as regular markup.
//They are combined in Parser#_contentFlags; regular tags have no flag (0).
var SpecialTags = {};
SpecialTags[ElementType.Style]   = 1 << 0;
SpecialTags[ElementType.Script]  = 1 << 1;
SpecialTags[ElementType.Comment] = 1 << 2;
SpecialTags[ElementType.CDATA]   = 1 << 3;

//Flag values looked up by the name found in a closing tag
var TagValues = {
	style: 1,
	script: 2
};
118
//Splits the buffered input at every "<" and ">" and hands each chunk to the
//matching handler: text, opening/closing tag, comment, CDATA section,
//"!"/"?" directive or the raw content of a script/style element.
//Nothing is returned; results are reported through the callbacks and the
//consumed part of the input is removed from the buffer.
//	force: when truthy, the rest of the buffer is consumed as one final chunk
//	       even though no further "<" follows (used once the input has ended)
Parser.prototype._parseTags = function(force){
	var current = 0, //index of the first character that is not consumed yet
		opening = this._buffer.indexOf("<"),
		closing = this._buffer.indexOf(">"),
		next, rawData, elementData, lastTagSep;

	//an "opening" at Infinity makes the last iteration read up to the end of the buffer
	if(force) opening = Infinity;

	//both indices are -1 (and therefore equal) once no separator is left
	while(opening !== closing && this._running){
		lastTagSep = this._tagSep;

		//continue with whichever separator comes first
		if((opening !== -1 && opening < closing) || closing === -1){
			next = opening;
			this._tagSep = "<";
			opening = this._buffer.indexOf("<", next + 1);
		}
		else{
			next = closing;
			this._tagSep = ">";
			closing = this._buffer.indexOf(">", next + 1);
		}
		rawData = this._buffer.substring(current, next); //the next chunk of data to parse

		//set elements for next run
		current = next + 1;

		if(this._contentFlags >= SpecialTags[ElementType.CDATA]){
			//we're inside a CDATA section
			this._writeCDATA(rawData);
		}
		else if(this._contentFlags >= SpecialTags[ElementType.Comment]){
			//we're inside a comment
			this._writeComment(rawData);
		}
		else if(lastTagSep === "<"){
			//the chunk follows a "<", so it is the inside of some kind of tag
			elementData = rawData.trimLeft();
			if(elementData.charAt(0) === "/"){
				elementData = this._parseTagName(elementData.substr(1));
				if(this._contentFlags !== 0){
					//inside script/style only the matching closing tag ends the raw content
					if(this._contentFlags & TagValues[elementData]){
						//remove the flag
						this._contentFlags ^= TagValues[elementData];
					}
					else {
						this._writeSpecial(rawData, lastTagSep);
						continue;
					}
				}
				this._processCloseTag(elementData);
			}
			else if(elementData.charAt(0) === "!"){
				if(elementData.substr(1, 2) === "--"){
					//This tag is a comment
					this._contentFlags |= SpecialTags[ElementType.Comment];
					//slice the trimmed data: the "!--" prefix was detected on it, so
					//leading whitespace in rawData must not shift the offset
					this._writeComment(elementData.substr(3));
				}
				else if(elementData.substr(1, 7) === "[CDATA["){
					this._contentFlags |= SpecialTags[ElementType.CDATA];
					if(this._cbs.oncdatastart) this._cbs.oncdatastart();
					this._writeCDATA(elementData.substr(8));
				}
				else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep);
				//TODO: This isn't a processing instruction, needs a new name
				else if(this._cbs.onprocessinginstruction){
					this._cbs.onprocessinginstruction(
						"!" + this._parseTagName(elementData.substr(1)),
						elementData
					);
				}
			}
			else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep);
			else if(elementData.charAt(0) === "?"){
				if(this._cbs.onprocessinginstruction){
					this._cbs.onprocessinginstruction(
						"?" + this._parseTagName(elementData.substr(1)),
						elementData
					);
				}
			}
			else this._processOpenTag(elementData);
		}
		else{
			//the chunk does not follow a "<": it is text or raw script/style content
			if(this._contentFlags !== 0){
				this._writeSpecial(rawData, ">");
			}
			else if(rawData !== "" && this._cbs.ontext){
				if(this._tagSep === ">") rawData += ">"; //it's the second > in a row
				this._cbs.ontext(rawData);
			}
		}
	}

	//drop everything that was consumed
	this._buffer = this._buffer.substr(current);
};
219
//Handles one chunk of a CDATA section.
//A chunk that ends in "]]" and was terminated by ">" closes the section: its
//content (if any) is reported as text, the CDATA flag is cleared and
//oncdataend is called. Any other chunk is reported as text together with the
//separator that terminated it.
Parser.prototype._writeCDATA = function(data){
	var closesSection = this._tagSep === ">" && data.slice(-2) === "]]";

	if(!closesSection){
		if(this._cbs.ontext) this._cbs.ontext(data + this._tagSep);
		return;
	}

	//a chunk of exactly "]]" carries no text
	if(data.length !== 2 && this._cbs.ontext){
		this._cbs.ontext(data.slice(0, -2));
	}
	this._contentFlags ^= SpecialTags[ElementType.CDATA];
	if(this._cbs.oncdataend) this._cbs.oncdataend();
};
231
//Handles one chunk of a comment.
//A chunk that ends in "--" and was terminated by ">" closes the comment: its
//content is reported through oncomment, followed by oncommentend, and the
//comment flag as well as the "wrote special" marker are cleared. Any other
//chunk is reported through oncomment together with its terminating separator.
Parser.prototype._writeComment = function(rawData){
	var endsComment = this._tagSep === ">" && rawData.slice(-2) === "--";

	if(!endsComment){
		if(this._cbs.oncomment) this._cbs.oncomment(rawData + this._tagSep);
		return;
	}

	this._contentFlags ^= SpecialTags[ElementType.Comment];
	this._wroteSpecial = false;
	if(this._cbs.oncomment) this._cbs.oncomment(rawData.slice(0, -2));
	if(this._cbs.oncommentend) this._cbs.oncommentend();
};
242
//Reports a chunk of raw script/style content as text.
//The first chunk is reported as is (and skipped when empty); every following
//chunk is prefixed with the separator that preceded it, because that
//separator is part of the content.
Parser.prototype._writeSpecial = function(rawData, lastTagSep){
	if(!this._wroteSpecial){
		//nothing was written for this element yet
		this._wroteSpecial = true;
		if(rawData !== "" && this._cbs.ontext) this._cbs.ontext(rawData);
		return;
	}

	if(this._cbs.ontext) this._cbs.ontext(lastTagSep + rawData);
};
253
//Elements without content: outside of XML mode they are closed as soon as
//they are opened and their closing tags are ignored.
//The lookup has no prototype, so `name in emptyTags` can only match the
//names listed here.
var emptyTags = (function(names){
	var lookup = Object.create(null);
	for(var i = 0; i < names.length; i++){
		lookup[names[i]] = true;
	}
	return lookup;
})([
	"area", "base", "basefont", "br", "col", "frame", "hr",
	"img", "input", "isindex", "link", "meta", "param", "embed"
]);
271
//Handles a closing tag with the given name.
//The innermost open element of that name is closed together with everything
//opened after it; onclosetag is called for each of them, innermost first.
//A closing tag that matches no open element is ignored. Outside of XML mode
//closing tags of empty elements are ignored too, except for </br>, which is
//treated like <br/>.
Parser.prototype._processCloseTag = function(name){
	var closable = this._stack && (!(name in emptyTags) || this._options.xmlMode);

	if(!closable){
		//many browsers (eg. Safari, Chrome) convert </br> to <br>
		if(name === "br" && !this._options.xmlMode){
			this._processOpenTag(name + "/");
		}
		return;
	}

	var pos = this._stack.lastIndexOf(name);
	if(pos === -1) return;

	if(!this._cbs.onclosetag){
		//nobody listens, so the elements are dropped in one go
		this._stack.splice(pos);
		return;
	}

	for(var remaining = this._stack.length - pos; remaining > 0; remaining--){
		this._cbs.onclosetag(this._stack.pop());
	}
};
286
//Reports every attribute found in the tag data through onattribute(name, value).
//Attributes without a value are reported with an empty string.
//_reAttrib is a shared regular expression with the global flag, so its
//lastIndex survives between calls. It is reset before the scan and again
//afterwards (even when a callback throws); otherwise an interrupted scan
//would make the next one start in the middle of its input and miss attributes.
Parser.prototype._parseAttributes = function(data){
	var match;

	_reAttrib.lastIndex = 0;
	try{
		while((match = _reAttrib.exec(data)) !== null){
			this._cbs.onattribute(match[1], match[2] || match[3] || match[4] || "");
		}
	}
	finally{
		_reAttrib.lastIndex = 0;
	}
};
292
//Collects the attributes found in the tag data into a plain object that maps
//attribute names to values. Attributes without a value map to an empty
//string; when a name occurs more than once the last value wins.
//_reAttrib is a shared regular expression with the global flag, so it is
//rewound first: a lastIndex left over from an interrupted scan elsewhere
//would otherwise make this scan start in the middle of `data`.
var parseAttributes = function(data){
	var attrs = {},
		match;

	_reAttrib.lastIndex = 0;
	while((match = _reAttrib.exec(data)) !== null){
		attrs[match[1]] = match[2] || match[3] || match[4] || "";
	}
	return attrs;
};
301
//Handles an opening tag; `data` is the text between "<" and ">".
//Calls onopentagname, onopentag (with the attributes as an object) and
//onattribute (once per attribute) where present. A tag that ends in "/" or,
//outside of XML mode, names an empty element is closed right away; any other
//element is pushed onto the stack. Outside of XML mode script and style
//elements additionally switch the parser to raw content handling.
Parser.prototype._processOpenTag = function(data){
	var name = this._parseTagName(data);
	var type = ElementType.Tag;

	if(!this._options.xmlMode){
		if(name === "script") type = ElementType.Script;
		else if(name === "style") type = ElementType.Style;
	}

	if(this._cbs.onopentagname) this._cbs.onopentagname(name);
	if(this._cbs.onopentag) this._cbs.onopentag(name, parseAttributes(data));
	if(this._cbs.onattribute) this._parseAttributes(data);

	var selfClosing = data.slice(-1) === "/" || (name in emptyTags && !this._options.xmlMode);
	if(selfClosing){
		//report an explicit, separate closing tag
		if(this._cbs.onclosetag) this._cbs.onclosetag(name);
		return;
	}

	if(type !== ElementType.Tag){
		this._contentFlags |= SpecialTags[type];
		this._wroteSpecial = false;
	}
	this._stack.push(name);
};
325
//Reports an error: passes it to the onerror callback when one is defined and
//throws it otherwise.
//	error: an error message or an Error instance. Messages are wrapped in a new
//	       Error; Error instances are passed on unchanged so that their type,
//	       message and stack are preserved.
Parser.prototype._handleError = function(error){
	var err = error instanceof Error ? error : new Error(error);

	if(this._cbs.onerror) this._cbs.onerror(err);
	else throw err;
};
331
332module.exports = Parser;
\No newline at end of file