UNPKG

9.63 kBJavaScriptView Raw
1var ElementType = require("./ElementType.js");
2
/**
 * Creates a parser, or re-initialises an existing one when invoked through
 * `Parser.call(instance, cbs, options)` as `reset()` does.
 *
 * @param {Object} [cbs] Callback handlers; `defaultCbs` is used when falsy.
 * @param {Object} [options] Parser options; `defaultOpts` is used when falsy.
 */
function Parser(cbs, options){
	//configuration and handlers
	this._options = options ? options : defaultOpts;
	this._cbs = cbs ? cbs : defaultCbs;

	//input that was written but not consumed yet
	this._buffer = "";
	//the last "<" or ">" separator that was seen ("" before the first one)
	this._tagSep = "";
	//names of the tags that are currently open
	this._stack = [];

	//bookkeeping for special content (see SpecialTags)
	this._wroteSpecial = false;
	this._contentFlags = 0;

	//lifecycle: _done is set by end(), _running is false while paused
	this._done = false;
	this._running = true;
}
14
//Regular expressions used for cleaning up and parsing

//Matches a single attribute that is preceded by whitespace.
//Group 1 is the name; groups 2, 3 and 4 hold a double-quoted, single-quoted
//or unquoted value. The pattern is global, so exec() advances its lastIndex.
var _reAttrib = /\s([^\s\/]+?)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+))|(?=\s)|\/|$)/g;

//Matches whatever terminates a tag name: whitespace, a slash or the end of the input
var _reTail = /\s|\/|$/;
18
//Options that are used when the Parser constructor doesn't receive any
var defaultOpts = {
	//false: script/style tags get their special behavior
	xmlMode: false,

	//true: call .toLowerCase for each attribute name
	lowerCaseAttributeNames: false,

	//true: call .toLowerCase for each tag name
	lowerCaseTags: false
};
24
//Callbacks that are used when the Parser constructor doesn't receive any.
//This is just a plain, empty object so that the parser doesn't throw
//if no arguments were provided.
//
//Handlers the parser looks up on its callback object:
//	onattribute, oncdataend, oncdatastart, onclosetag, oncomment,
//	oncommentend, onend, onerror, onopentag, onopentagname,
//	onprocessinginstruction, onreset, ontext
var defaultCbs = {};
45
/**
 * Parses a complete HTML document and pushes it to the handler.
 * The parser is reset first, then the data is passed to `end()`.
 *
 * @param {string} data The whole document.
 */
Parser.prototype.parseComplete = function(data){
	var parser = this;

	parser.reset();
	parser.end(data);
};
51
/**
 * Parses a piece of an HTML document (also available as `parseChunk`).
 *
 * The chunk is appended to the internal buffer and, unless the parser is
 * paused, parsed right away.
 *
 * Writing after `end()` is an error: it is reported through `_handleError`
 * and the chunk is discarded, so that no events are emitted for a document
 * that was already finalised.
 *
 * @param {string} data The next chunk of the document.
 */
Parser.prototype.parseChunk =
Parser.prototype.write = function(data){
	if(this._done){
		this._handleError("Attempted to parse chunk after parsing already done");
		//_handleError only throws without an onerror handler, so bail out explicitly
		return;
	}

	this._buffer += data; //FIXME: this can be a bottleneck
	if(this._running) this._parseTags();
};
59
/**
 * Tells the parser that the HTML being parsed is complete (also available
 * as `done`). Calling it again once the parser is done has no effect.
 *
 * If the parser is paused, the final parsing step is left to `resume()`.
 *
 * @param {string} [chunk] A last chunk that is written before finishing.
 */
Parser.prototype.done =
Parser.prototype.end = function(chunk){
	if(!this._done){
		if(chunk){
			this.write(chunk);
		}

		this._done = true;

		if(this._running){
			this._finishParsing();
		}
	}
};
70
/**
 * Final parsing step: consumes what is left in the buffer, reports a close
 * event for every tag that is still on the stack (only when an `onclosetag`
 * handler exists) and finally calls `onend`.
 */
Parser.prototype._finishParsing = function(){
	//Parse the buffer to its end
	if(this._buffer){
		this._parseTags(true);
	}

	//Close the remaining tags, innermost first
	if(this._cbs.onclosetag){
		while(this._stack.length !== 0){
			this._cbs.onclosetag(this._stack.pop());
		}
	}

	if(this._cbs.onend){
		this._cbs.onend();
	}
};
81
/**
 * Pauses parsing. Has no effect once the parser is done.
 */
Parser.prototype.pause = function(){
	if(this._done) return;

	this._running = false;
};
85
/**
 * Resumes a paused parser: the buffered input is parsed and, if `end()` was
 * called in the meantime, parsing is finished. Does nothing when the parser
 * is not paused.
 */
Parser.prototype.resume = function(){
	if(!this._running){
		this._running = true;
		this._parseTags();

		if(this._done){
			this._finishParsing();
		}
	}
};
92
/**
 * Resets the parser to a blank state, ready to parse a new HTML document.
 * The constructor is re-run on this instance with the current callbacks and
 * options, then the `onreset` handler (if any) is notified.
 */
Parser.prototype.reset = function(){
	var cbs = this._cbs,
		options = this._options;

	Parser.call(this, cbs, options);

	if(this._cbs.onreset){
		this._cbs.onreset();
	}
};
98
/**
 * Extracts the base tag name from the data value of an element, i.e.
 * everything in front of the first whitespace, slash or the end of the data.
 *
 * @param {string} data The content of a tag.
 * @returns {string} The tag name, lower-cased if `lowerCaseTags` is set.
 */
Parser.prototype._parseTagName = function(data){
	var end = data.search(_reTail),
		name = data.substr(0, end);

	return this._options.lowerCaseTags ? name.toLowerCase() : name;
};
105
//Bit flags, keyed by element type, for the content that is treated
//differently; they are combined in Parser#_contentFlags.
//Regular tags (ElementType.Tag) don't have a flag.
var SpecialTags = {};
SpecialTags[ElementType.Style] = 1 << 0; //0x1
SpecialTags[ElementType.Script] = 1 << 1; //0x2
SpecialTags[ElementType.Comment] = 1 << 2; //0x4
SpecialTags[ElementType.CDATA] = 1 << 3; //0x8
113
//Content flags keyed by tag name; used to check whether a closing tag
//ends the special content that is currently open
var TagValues = {
	style: 0x1,
	script: 0x2
};
118
/*
	Parses through the buffered HTML text: the buffer is split at every "<"
	and ">" and each segment is dispatched to the matching handler (text,
	opening/closing tags, comments, CDATA, processing instructions).
	Nothing is returned; results are reported through the callbacks.

	The separator that ended the previous segment is kept in this._tagSep,
	so the state carries over to the next call.

	force: if truthy, the rest of the buffer is consumed as well, even though
	it isn't terminated by a separator (used by _finishParsing).
*/
Parser.prototype._parseTags = function(force){
	var current = 0, //start of the segment that is processed next
		opening = this._buffer.indexOf("<"),
		closing = this._buffer.indexOf(">"),
		next, rawData, elementData, lastTagSep;

	//if force is true, parse everything
	//(once no ">" is left, the segment runs up to the end of the buffer)
	if(force) opening = Infinity;

	//opening !== closing is just false if both are -1
	//the loop also stops as soon as a callback pauses the parser
	while(opening !== closing && this._running){
		//the separator in front of the current segment
		lastTagSep = this._tagSep;

		//continue with whichever separator comes first
		if((opening !== -1 && opening < closing) || closing === -1){
			next = opening;
			this._tagSep = "<";
			opening = this._buffer.indexOf("<", next + 1);
		}
		else{
			next = closing;
			this._tagSep = ">";
			closing = this._buffer.indexOf(">", next + 1);
		}
		rawData = this._buffer.substring(current, next); //The next chunk of data to parse

		//set elements for next run
		current = next + 1;

		if(this._contentFlags >= SpecialTags[ElementType.CDATA]){
			// We're inside a CDATA section
			this._writeCDATA(rawData);

		}
		else if(this._contentFlags >= SpecialTags[ElementType.Comment]){
			//We're in a comment tag
			this._writeComment(rawData);
		}
		else if(lastTagSep === "<"){
			//the segment followed a "<", so it is the content of a tag
			elementData = rawData.trimLeft();
			if(elementData.charAt(0) === "/"){
				//closing tag
				elementData = this._parseTagName(elementData.substr(1));
				if(this._contentFlags !== 0){
					//inside special content: only the matching closing tag ends it
					//if it's a closing tag, remove the flag
					if(this._contentFlags & TagValues[elementData]){
						//remove the flag
						this._contentFlags ^= TagValues[elementData];
					} else {
						//any other closing tag is passed on as special content
						this._writeSpecial(rawData, lastTagSep);
						continue;
					}
				}
				this._processCloseTag(elementData);
			}
			else if(elementData.charAt(0) === "!"){
				if(elementData.substr(1, 7) === "[CDATA["){
					//start of a CDATA section; the rest of the segment is its first part
					this._contentFlags |= SpecialTags[ElementType.CDATA];
					if(this._cbs.oncdatastart) this._cbs.oncdatastart();
					this._writeCDATA(elementData.substr(8));
				}
				else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep);
				else if(elementData.substr(1, 2) === "--"){
					//This tag is a comment
					this._contentFlags |= SpecialTags[ElementType.Comment];
					this._writeComment(rawData.substr(3));
				}
				//TODO: This isn't a processing instruction, needs a new name
				else if(this._cbs.onprocessinginstruction){
					this._cbs.onprocessinginstruction(
						"!" + this._parseTagName(elementData.substr(1)),
						elementData
					);
				}
			}
			else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep);
			else if(elementData.charAt(0) === "?"){
				if(this._cbs.onprocessinginstruction){
					this._cbs.onprocessinginstruction(
						"?" + this._parseTagName(elementData.substr(1)),
						elementData
					);
				}
			}
			else this._processOpenTag(elementData);
		}
		else{
			//the segment didn't follow a "<": it is text
			if(this._contentFlags !== 0){
				this._writeSpecial(rawData, ">");
			}
			else if(rawData !== "" && this._cbs.ontext){
				if(this._tagSep === ">") rawData += ">"; //it's the second > in a row
				this._cbs.ontext(rawData);
			}
		}
	}

	//drop the part of the buffer that was consumed
	this._buffer = this._buffer.substr(current);
};
218
/**
 * Handles a segment that belongs to a CDATA section.
 *
 * A segment that ends with "]]" and is followed by ">" closes the section:
 * the remaining text (if any) is emitted, the CDATA flag is removed and
 * `oncdataend` is called. Every other segment is emitted as text together
 * with the separator that followed it.
 *
 * @param {string} data The segment to write.
 */
Parser.prototype._writeCDATA = function(data){
	var endsSection = this._tagSep === ">" && data.substr(-2) === "]]";

	if(!endsSection){
		if(this._cbs.ontext) this._cbs.ontext(data + this._tagSep);
		return;
	}

	//don't emit an empty text node if there is nothing in front of the "]]"
	if(data.length !== 2 && this._cbs.ontext){
		this._cbs.ontext(data.slice(0, -2));
	}

	this._contentFlags ^= SpecialTags[ElementType.CDATA];
	if(this._cbs.oncdataend) this._cbs.oncdataend();
};
230
/**
 * Handles a segment that belongs to a comment.
 *
 * A segment that ends with "--" and is followed by ">" closes the comment:
 * the comment flag is removed, `_wroteSpecial` is cleared, the text in front
 * of the "--" goes to `oncomment` and `oncommentend` is called. Every other
 * segment goes to `oncomment` together with the separator that followed it.
 *
 * @param {string} rawData The segment to write.
 */
Parser.prototype._writeComment = function(rawData){
	var endsComment = this._tagSep === ">" && rawData.substr(-2) === "--";

	if(!endsComment){
		if(this._cbs.oncomment) this._cbs.oncomment(rawData + this._tagSep);
		return;
	}

	//leave the comment state before the handlers are notified
	this._contentFlags ^= SpecialTags[ElementType.Comment];
	this._wroteSpecial = false;

	if(this._cbs.oncomment) this._cbs.oncomment(rawData.slice(0, -2));
	if(this._cbs.oncommentend) this._cbs.oncommentend();
};
241
/**
 * Emits a segment of special content as text.
 *
 * The first segment is emitted as it is (and skipped when empty); every
 * following segment is prefixed with the separator that preceded it.
 *
 * @param {string} rawData The segment to write.
 * @param {string} lastTagSep The separator in front of the segment.
 */
Parser.prototype._writeSpecial = function(rawData, lastTagSep){
	if(!this._wroteSpecial){
		//The previous element was not text
		this._wroteSpecial = true;
		if(rawData !== "" && this._cbs.ontext) this._cbs.ontext(rawData);
		return;
	}

	//the previous element is text: put the last tag sep in front of the segment
	if(this._cbs.ontext) this._cbs.ontext(lastTagSep + rawData);
};
252
//Names of the tags that are closed right away outside of xmlMode.
//The lookup object has no prototype, so `name in emptyTags` is only true
//for the names listed here.
var emptyTags = Object.create(null);

[
	"area", "base", "basefont", "br", "col", "frame", "hr",
	"img", "input", "isindex", "link", "meta", "param", "embed"
].forEach(function(tagName){
	emptyTags[tagName] = true;
});
270
/**
 * Handles a closing tag.
 *
 * If the tag is open, it is closed together with every tag that was opened
 * after it; with an `onclosetag` handler each of them is reported, otherwise
 * they are simply removed from the stack. Closing tags of empty tags are
 * ignored outside of xmlMode, except for `</br>`, which is turned into a
 * self-closing `<br/>`.
 *
 * @param {string} name The name of the tag that is closed.
 */
Parser.prototype._processCloseTag = function(name){
	var mayBeOpen = this._stack && (!(name in emptyTags) || this._options.xmlMode);

	if(!mayBeOpen){
		//many browsers (eg. Safari, Chrome) convert </br> to <br>
		if(name === "br" && !this._options.xmlMode){
			this._processOpenTag(name + "/");
		}
		return;
	}

	var pos = this._stack.lastIndexOf(name);
	if(pos === -1){
		//the tag was never opened
		return;
	}

	if(!this._cbs.onclosetag){
		//nobody to notify, just drop the tag and everything inside of it
		this._stack.splice(pos);
		return;
	}

	//report the tags from the innermost one up to (and including) the match
	for(var remaining = this._stack.length - pos; remaining > 0; remaining--){
		this._cbs.onclosetag(this._stack.pop());
	}
};
286
/**
 * Reports every attribute of a tag through the `onattribute` handler.
 *
 * A private copy of the attribute pattern is used for the iteration. The
 * shared `_reAttrib` is global and therefore stateful (`lastIndex`): a
 * handler that throws would leave it in the middle of the data, and a
 * handler that parses other data in the meantime would rewind it and make
 * this loop start over again.
 *
 * @param {string} data The content of the tag (name and attributes).
 * @param {boolean} [lcNames] Lower-case the attribute names.
 */
Parser.prototype._parseAttributes = function(data, lcNames){
	var reAttrib = new RegExp(_reAttrib.source, "g"),
		match;

	while((match = reAttrib.exec(data)) !== null){
		this._cbs.onattribute(
			lcNames ? match[1].toLowerCase() : match[1],
			//double-quoted, single-quoted or unquoted value; "" if there is none
			match[2] || match[3] || match[4] || ""
		);
	}
};
292
/**
 * Parses the attribute string of a tag into an object.
 *
 * A private copy of the attribute pattern is used for the iteration: the
 * shared `_reAttrib` is global, so a `lastIndex` left behind by an aborted
 * iteration would make the search start in the middle of the data and drop
 * attributes.
 *
 * @param {string} data The content of the tag (name and attributes).
 * @param {boolean} [lcNames] Lower-case the attribute names.
 * @returns {Object} Attribute values by name; "" for attributes without a value.
 */
var parseAttributes = function(data, lcNames){
	var attrs = {},
		reAttrib = new RegExp(_reAttrib.source, "g"),
		match;

	while((match = reAttrib.exec(data)) !== null){
		//double-quoted, single-quoted or unquoted value
		attrs[lcNames ? match[1].toLowerCase() : match[1]] = match[2] || match[3] || match[4] || "";
	}

	return attrs;
};
301
/**
 * Handles an opening tag.
 *
 * The handlers are notified in this order: `onopentagname`, `onopentag`
 * (with the parsed attributes) and `onattribute` (once per attribute).
 * Self-terminating tags and, outside of xmlMode, empty tags are closed right
 * away; every other tag is put on the stack. Outside of xmlMode, script and
 * style tags additionally switch the parser to their special content.
 *
 * @param {string} data The content of the tag (name and attributes).
 */
Parser.prototype._processOpenTag = function(data){
	var name = this._parseTagName(data),
		type = ElementType.Tag;

	//script and style are only special outside of xmlMode
	if(!this._options.xmlMode){
		if(name === "script") type = ElementType.Script;
		else if(name === "style") type = ElementType.Style;
	}

	if(this._cbs.onopentagname){
		this._cbs.onopentagname(name);
	}
	if(this._cbs.onopentag){
		var attribs = parseAttributes(data, this._options.lowerCaseAttributeNames);
		this._cbs.onopentag(name, attribs);
	}
	if(this._cbs.onattribute){
		this._parseAttributes(data, this._options.lowerCaseAttributeNames);
	}

	//If tag self-terminates, add an explicit, separate closing tag
	var closesItself = data.substr(-1) === "/" || (name in emptyTags && !this._options.xmlMode);
	if(closesItself){
		if(this._cbs.onclosetag) this._cbs.onclosetag(name);
		return;
	}

	if(type !== ElementType.Tag){
		//everything up to the closing tag is special content
		this._contentFlags |= SpecialTags[type];
		this._wroteSpecial = false;
	}
	this._stack.push(name);
};
331
/**
 * Reports an error: it is wrapped in an `Error` and passed to the `onerror`
 * handler, or thrown if there is no such handler.
 *
 * @param {string} error The error message.
 * @throws {Error} If no `onerror` handler is registered.
 */
Parser.prototype._handleError = function(error){
	var wrapped = new Error(error);

	if(!this._cbs.onerror) throw wrapped;

	this._cbs.onerror(wrapped);
};
337
//The module exports the Parser constructor itself
module.exports = Parser;
\No newline at end of file