UNPKG

9.7 kBJavaScriptView Raw
1var ElementType = require("./ElementType.js");
2
//Chunk-wise tag parser that reports what it finds through callbacks.
//cbs: object with the optional event callbacks (defaultCbs when omitted)
//options: parser settings (defaultOpts when omitted)
//reset() re-runs this constructor on an existing instance.
function Parser(cbs, options){
	//configuration
	this._cbs = cbs || defaultCbs;
	this._options = options || defaultOpts;

	//parsing state
	this._buffer = ""; //input that has not been consumed yet
	this._tagSep = ""; //separator ("<" or ">") that ended the last piece
	this._stack = []; //names of the currently open tags
	this._contentFlags = 0; //bit mask built from SpecialTags values
	this._wroteSpecial = false; //true once text was written inside special content

	//life cycle
	this._running = true; //false if paused
	this._done = false; //true once end() was called
}
14
//Regular expressions used for cleaning up and parsing.
//_reAttrib matches one attribute per exec() call: group 1 is the name, groups 2-4
//hold a double-quoted, single-quoted or unquoted value. It has the "g" flag, so it
//remembers its position in lastIndex between calls.
var _reAttrib = /\s([^\s\/]+?)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+))|(?=\s)|\/|$)/g;
//_reTail finds where a tag name ends: at whitespace, a slash or the end of the string.
var _reTail = /\s|\/|$/;
18
//Settings used when the Parser constructor is called without an options object
var defaultOpts = {
	//when false, script/style tags get their special content handling
	xmlMode: false,
	//when true, .toLowerCase is called for each attribute name
	lowerCaseAttributeNames: false,
	//when true, .toLowerCase is called for each tag name
	lowerCaseTags: false
};
24
//Callback object used when the Parser constructor is called without one.
//It is deliberately a plain, empty object so that the parser doesn't throw
//if no arguments were provided.
//Callbacks a handler may define include:
//	oncdataend, oncdatastart, onclosetag, oncomment, oncommentend, onerror,
//	onopentag, onopentagend, onprocessinginstruction, onreset, ontext
var defaultCbs = {};
46
//Parses a whole document in one call.
//The parser is reset first, then `data` is handed to end(), which writes it
//and finishes parsing.
Parser.prototype.parseComplete = function(data){
	var parser = this;

	parser.reset();
	parser.end(data);
};
52
//Parses a piece of an HTML document (also available as parseChunk).
//data: the next chunk of input; it is appended to the internal buffer and,
//unless the parser is paused, parsed right away.
//Writing after end() is an error: it is reported through _handleError (the
//onerror callback, or a thrown Error if there is none) and the chunk is
//dropped, so no further events are emitted for a finished document.
Parser.prototype.parseChunk =
Parser.prototype.write = function(data){
	if(this._done){
		this._handleError("Attempted to parse chunk after parsing already done");
		return;
	}
	this._buffer += data; //FIXME: this can be a bottleneck
	if(this._running) this._parseTags();
};
60
//Tells the parser that the HTML being parsed is complete (also available as done).
//chunk: optional last piece of input, written before the parser is marked done.
//Only the first call has an effect. While the parser is paused, finishing is
//not started here.
Parser.prototype.done =
Parser.prototype.end = function(chunk){
	if(!this._done){
		if(chunk){
			this.write(chunk);
		}
		this._done = true;

		if(this._running){
			this._finishParsing();
		}
	}
};
71
//Wraps up a document: parses whatever is left in the buffer, reports a close
//tag for every name still on the stack (only when an onclosetag callback
//exists) and finally fires onend.
Parser.prototype._finishParsing = function(){
	if(this._buffer){
		//force the remaining input through the parser
		this._parseTags(true);
	}

	if(this._cbs.onclosetag){
		while(this._stack.length){
			this._cbs.onclosetag(this._stack.pop());
		}
	}

	if(this._cbs.onend){
		this._cbs.onend();
	}
};
82
//Pauses parsing; has no effect once the parser is marked done.
Parser.prototype.pause = function(){
	if(this._done) return;
	this._running = false;
};
86
//Continues a paused parser: parses what is in the buffer and, if end() was
//already called, finishes the document. Does nothing when not paused.
Parser.prototype.resume = function(){
	if(!this._running){
		this._running = true;
		this._parseTags();
		if(this._done){
			this._finishParsing();
		}
	}
};
93
//Resets the parser to a blank state, ready to parse a new HTML document.
//The constructor is re-run on this instance with the current callbacks and
//options, then the onreset callback (if any) is notified.
Parser.prototype.reset = function(){
	var cbs = this._cbs,
	    options = this._options;

	Parser.call(this, cbs, options);

	if(this._cbs.onreset){
		this._cbs.onreset();
	}
};
99
//Extracts the base tag name from the data value of an element.
//The name is everything before the first whitespace or slash (or the whole
//string); it is lower-cased when the lowerCaseTags option is set.
Parser.prototype._parseTagName = function(data){
	var nameEnd = data.search(_reTail),
	    name = data.substr(0, nameEnd);

	return this._options.lowerCaseTags ? name.toLowerCase() : name;
};
106
//Special content that is treated differently, as bit flags for _contentFlags.
//Ordinary tags (ElementType.Tag) have no entry, i.e. no flag.
var SpecialTags = {};
SpecialTags[ElementType.Style] = 1 << 0; //0x1
SpecialTags[ElementType.Script] = 1 << 1; //0x2
SpecialTags[ElementType.Comment] = 1 << 2; //0x4
SpecialTags[ElementType.CDATA] = 1 << 3; //0x8
114
//Content flag of each special tag, looked up by tag name when a closing tag
//is seen inside special content (same bit values as the Style/Script flags)
var TagValues = {
	style: 0x1,
	script: 0x2
};
119
//Scans the buffer for "<" and ">" separators and dispatches every piece
//between two separators: CDATA and comment content, closing tags,
//declarations/processing instructions, opening tags, or plain text.
//force: when truthy, whatever follows the last ">" is treated as one final piece.
//Consumed input is removed from the buffer afterwards; nothing is returned.
//The loop stops early when a callback pauses the parser.
Parser.prototype._parseTags = function(force){
	var start = 0,
	    lt = this._buffer.indexOf("<"),
	    gt = this._buffer.indexOf(">"),
	    sepPos, piece, tagBody, firstChar, prevSep;

	//if force is true, parse everything
	if(force) lt = Infinity;

	//lt !== gt is only false if both are -1
	while(lt !== gt && this._running){
		prevSep = this._tagSep;

		//pick whichever separator comes first
		if(gt === -1 || (lt !== -1 && lt < gt)){
			sepPos = lt;
			this._tagSep = "<";
			lt = this._buffer.indexOf("<", sepPos + 1);
		}
		else{
			sepPos = gt;
			this._tagSep = ">";
			gt = this._buffer.indexOf(">", sepPos + 1);
		}

		//the piece of data in front of that separator
		piece = this._buffer.substring(start, sepPos);
		//the next piece starts behind the separator
		start = sepPos + 1;

		if(this._contentFlags >= SpecialTags[ElementType.CDATA]){
			//inside a CDATA section
			this._writeCDATA(piece);
		}
		else if(this._contentFlags >= SpecialTags[ElementType.Comment]){
			//inside a comment
			this._writeComment(piece);
		}
		else if(prevSep === "<"){
			//the piece follows a "<", so it is the content of some tag
			tagBody = piece.trimLeft();
			firstChar = tagBody.charAt(0);

			if(firstChar === "/"){
				//closing tag: reduce it to its name
				tagBody = this._parseTagName(tagBody.substr(1));

				if(this._contentFlags !== 0 && !(this._contentFlags & TagValues[tagBody])){
					//some other closing tag inside script/style: it is just text
					this._writeSpecial(piece, prevSep);
				}
				else{
					if(this._contentFlags !== 0){
						//the special tag gets closed, remove its flag
						this._contentFlags ^= TagValues[tagBody];
					}
					this._processCloseTag(tagBody);
				}
			}
			else if(firstChar === "!"){
				if(tagBody.substr(1, 7) === "[CDATA["){
					this._contentFlags |= SpecialTags[ElementType.CDATA];
					if(this._cbs.oncdatastart){
						this._cbs.oncdatastart();
					}
					this._writeCDATA(tagBody.substr(8));
				}
				else if(this._contentFlags !== 0){
					this._writeSpecial(piece, prevSep);
				}
				else if(tagBody.substr(1, 2) === "--"){
					//this tag is a comment
					this._contentFlags |= SpecialTags[ElementType.Comment];
					this._writeComment(piece.substr(3));
				}
				//TODO: This isn't a processing instruction, needs a new name
				else if(this._cbs.onprocessinginstruction){
					this._cbs.onprocessinginstruction(
						"!" + this._parseTagName(tagBody.substr(1)),
						tagBody
					);
				}
			}
			else if(this._contentFlags !== 0){
				this._writeSpecial(piece, prevSep);
			}
			else if(firstChar === "?"){
				if(this._cbs.onprocessinginstruction){
					this._cbs.onprocessinginstruction(
						"?" + this._parseTagName(tagBody.substr(1)),
						tagBody
					);
				}
			}
			else{
				this._processOpenTag(tagBody);
			}
		}
		else if(this._contentFlags !== 0){
			this._writeSpecial(piece, ">");
		}
		else if(piece !== "" && this._cbs.ontext){
			//a ">" that follows another ">" belongs to the text
			this._cbs.ontext(this._tagSep === ">" ? piece + ">" : piece);
		}
	}

	//drop everything that was consumed
	this._buffer = this._buffer.substr(start);
};
219
//Handles a piece of data inside a CDATA section.
//If the piece was ended by ">" and finishes with "]]", the section is over:
//the text in front of "]]" (if any) is emitted, the CDATA flag is removed and
//oncdataend fires. Otherwise the piece plus its separator is emitted as text.
Parser.prototype._writeCDATA = function(data){
	var sectionEnds = this._tagSep === ">" && data.substr(-2) === "]]";

	if(!sectionEnds){
		if(this._cbs.ontext){
			this._cbs.ontext(data + this._tagSep);
		}
		return;
	}

	if(data.length !== 2 && this._cbs.ontext){
		this._cbs.ontext(data.slice(0, -2));
	}
	this._contentFlags ^= SpecialTags[ElementType.CDATA];
	if(this._cbs.oncdataend){
		this._cbs.oncdataend();
	}
};
231
//Handles a piece of data inside a comment.
//If the piece was ended by ">" and finishes with "--", the comment is over:
//the comment flag and the wrote-special marker are cleared, the data in front
//of "--" goes to oncomment and oncommentend fires. Otherwise the piece plus
//its separator goes to oncomment.
Parser.prototype._writeComment = function(rawData){
	if(this._tagSep !== ">" || rawData.substr(-2) !== "--"){
		//still inside the comment
		if(this._cbs.oncomment){
			this._cbs.oncomment(rawData + this._tagSep);
		}
		return;
	}

	//comment ends
	this._contentFlags ^= SpecialTags[ElementType.Comment];
	this._wroteSpecial = false;
	if(this._cbs.oncomment){
		this._cbs.oncomment(rawData.slice(0, -2));
	}
	if(this._cbs.oncommentend){
		this._cbs.oncommentend();
	}
};
242
//Emits text found inside special content.
//rawData: the piece of text
//lastTagSep: the separator that preceded the piece
//The first piece is emitted as is (and skipped when empty); every later piece
//is prefixed with the separator that came before it.
Parser.prototype._writeSpecial = function(rawData, lastTagSep){
	if(!this._wroteSpecial){
		//nothing was written for this special content yet
		this._wroteSpecial = true;
		if(rawData !== "" && this._cbs.ontext){
			this._cbs.ontext(rawData);
		}
		return;
	}

	if(this._cbs.ontext){
		this._cbs.ontext(lastTagSep + rawData);
	}
};
253
//Tags that never have content or a closing tag (void elements).
//In HTML mode an opening tag with one of these names is closed immediately
//and a closing tag with one of these names is ignored.
//The object has no prototype, so `name in emptyTags` only matches the names
//listed here.
var emptyTags = {
	__proto__: null,
	area: true,
	base: true,
	basefont: true,
	br: true,
	col: true,
	command: true,
	embed: true,
	frame: true,
	hr: true,
	img: true,
	input: true,
	isindex: true,
	keygen: true,
	link: true,
	meta: true,
	param: true,
	source: true,
	track: true,
	wbr: true
};
271
//Handles a closing tag with the given name.
//For names that may have content (or any name in xmlMode), the innermost open
//tag with that name and everything opened after it are closed: with an
//onclosetag callback each one is popped and reported, without one they are
//just removed from the stack. A name that is not open is ignored.
//A closing </br> in HTML mode is turned into a self-closing <br/>.
Parser.prototype._processCloseTag = function(name){
	var mayHaveContent = this._stack && (!(name in emptyTags) || this._options.xmlMode),
	    pos, remaining;

	if(mayHaveContent){
		pos = this._stack.lastIndexOf(name);
		if(pos === -1) return;

		if(this._cbs.onclosetag){
			remaining = this._stack.length - pos;
			while(remaining--){
				this._cbs.onclosetag(this._stack.pop());
			}
		}
		else{
			this._stack.splice(pos);
		}
	}
	//many browsers (eg. Safari, Chrome) convert </br> to <br>
	else if(name === "br" && !this._options.xmlMode){
		this._processOpenTag(name + "/");
	}
};
287
//Reports every attribute found in `data` through the onattribute callback.
//data: the content of an opening tag (tag name followed by its attributes)
//lcNames: when truthy, attribute names are lower-cased before being reported
//An attribute without a value is reported with an empty string.
//The caller must make sure an onattribute callback exists.
Parser.prototype._parseAttributes = function(data, lcNames){
	var pos = 0, match;

	//_reAttrib is shared and keeps its position in lastIndex. The position is
	//tracked locally so that a scan always starts at the beginning of `data`
	//(even if an earlier scan was cut short by a throwing callback) and is not
	//disturbed when a callback uses the regular expression itself.
	for(;;){
		_reAttrib.lastIndex = pos;
		match = _reAttrib.exec(data);
		if(match === null) break;
		pos = _reAttrib.lastIndex;

		this._cbs.onattribute(
			lcNames ? match[1].toLowerCase() : match[1],
			match[2] || match[3] || match[4] || ""
		);
	}
};
293
//Parses the attribute string of an opening tag.
//data: the content of the tag (tag name followed by its attributes)
//lcNames: when truthy, attribute names are lower-cased
//Returns an object that maps each attribute name to its value; an attribute
//without a value maps to an empty string.
var parseAttributes = function(data, lcNames){
	var attrs = {},
	    match;

	//_reAttrib is shared and keeps its position in lastIndex; always start at
	//the beginning of `data`, whatever state an earlier user left behind
	_reAttrib.lastIndex = 0;

	while((match = _reAttrib.exec(data)) !== null){
		attrs[lcNames ? match[1].toLowerCase() : match[1]] = match[2] || match[3] || match[4] || "";
	}
	return attrs;
};
302
//Handles an opening tag.
//data: the content of the tag (name, attributes and an optional trailing "/")
//Fires onopentagname, onopentag (with an attribute object), onattribute (once
//per attribute) and onopentagend, in that order, for the callbacks that exist.
//A tag that ends with "/" or, outside xmlMode, is listed in emptyTags is
//closed right away; any other tag is pushed on the stack. Outside xmlMode,
//script and style tags also switch on their special content flag.
Parser.prototype._processOpenTag = function(data){
	var name = this._parseTagName(data),
	    type = ElementType.Tag,
	    closesItself;

	if(!this._options.xmlMode){
		if(name === "script") type = ElementType.Script;
		else if(name === "style") type = ElementType.Style;
	}

	if(this._cbs.onopentagname){
		this._cbs.onopentagname(name);
	}
	if(this._cbs.onopentag){
		this._cbs.onopentag(
			name,
			parseAttributes(data, this._options.lowerCaseAttributeNames)
		);
	}
	if(this._cbs.onattribute){
		this._parseAttributes(data, this._options.lowerCaseAttributeNames);
	}
	if(this._cbs.onopentagend){
		this._cbs.onopentagend();
	}

	//If tag self-terminates, add an explicit, separate closing tag
	closesItself = data.substr(-1) === "/" || (!this._options.xmlMode && name in emptyTags);
	if(closesItself){
		if(this._cbs.onclosetag){
			this._cbs.onclosetag(name);
		}
		return;
	}

	if(type !== ElementType.Tag){
		//entering script/style content
		this._contentFlags |= SpecialTags[type];
		this._wroteSpecial = false;
	}
	this._stack.push(name);
};
333
//Reports an error.
//error: the message; it is wrapped in an Error object, which is passed to the
//onerror callback or, if there is no such callback, thrown.
Parser.prototype._handleError = function(error){
	var err = new Error(error);

	if(!this._cbs.onerror){
		throw err;
	}
	this._cbs.onerror(err);
};
339
340module.exports = Parser;
\No newline at end of file