UNPKG

9.39 kBJavaScriptView Raw
1var ElementType = require("./ElementType.js");
2
//Parser constructor.
//cbs:     optional object holding the event handlers (see Parser.prototype._cbs)
//options: optional settings object (see Parser.prototype._options)
//A missing argument leaves the corresponding property untouched, which is
//what allows reset() to re-run this function without losing either one.
function Parser(cbs, options){
    if(options){
        this._options = options;
    }
    if(cbs){
        this._cbs = cbs;
    }

    //unparsed input carried over between write() calls
    this._buffer = "";
    //the separator ("<" or ">") that ended the previously handled chunk
    this._tagSep = "";
    //names of the currently open elements
    this._stack = [];
    //whether text was already emitted inside the current special section
    this._wroteSpecial = false;
    //sum of the SpecialTags values of the sections we are currently inside
    this._contentFlags = 0;
    //set once end() was called
    this._done = false;
    //false while paused
    this._running = true;
}
15
//Regular expressions used for cleaning up and parsing.

//Matches one attribute per exec() call: group 1 is the name, groups 2/3/4 are
//the double-quoted, single-quoted or unquoted value.
//NOTE: because of the g flag this object keeps a lastIndex between calls, so
//users have to run exec() until it returns null.
var _reAttrib = /\s(\S+?)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+))|(?=\s)|\/|$)/g;

//Finds the end of a tag name: the first whitespace, the first slash, or the
//end of the string.
var _reTail = /\s|\/|$/;
19
//Default options, used whenever the constructor receives none.
Parser.prototype._options = {
    //when true, tag names are passed through .toLowerCase()
    lowerCaseTags: false,
    //when false, script/style tags get their special treatment and the
    //emptyTags list is honoured
    xmlMode: false
};
24
//Default (empty) handler object, used whenever the constructor receives none.
//Every handler is optional and looked up right before it is called.
//Handlers used by this file:
//  onattribute, oncdataend, oncdatastart, onclosetag, oncomment,
//  oncommentend, onend, onerror, onopentag, onopentagname,
//  onprocessinginstruction, onreset, ontext
Parser.prototype._cbs = {};
39
//Convenience entry point for a document that is available in one piece:
//resets the parser, feeds it `data` and signals the end of the input.
//data: the complete markup to parse
Parser.prototype.parseComplete = function(data){
    //start from a blank state (also fires onreset)
    this.reset();

    //hand over the whole document ...
    this.write(data);

    //... and flush whatever is still buffered
    this.end();
};
46
//Appends a piece of the document to the internal buffer and, unless the
//parser is paused, parses as much of the buffer as possible.
//Writing after end() reports an error through _handleError; the data is
//still appended afterwards if that call returns.
//data: the next chunk of markup
Parser.prototype.write = function(data){
    if(this._done){
        this._handleError(Error("Attempted to parse chunk after parsing already done"));
    }

    this._buffer += data; //FIXME: this can be a bottleneck

    if(this._running){
        this._parseTags();
    }
};

//parseChunk is an alias of write
Parser.prototype.parseChunk = Parser.prototype.write;
54
//Marks the input as complete. An optional last chunk is written first.
//Calling it again after the parser is done has no effect. While the parser
//is paused the final flush is left to resume().
//chunk: optional final piece of markup
Parser.prototype.end = function(chunk){
    if(!this._done){
        if(chunk){
            this.write(chunk);
        }

        this._done = true;

        if(this._running){
            this._finishParsing();
        }
    }
};

//done is an alias of end
Parser.prototype.done = Parser.prototype.end;
65
//Final flush: parses what is left in the buffer, reports a close tag for
//every element that is still open (innermost first) and fires onend.
Parser.prototype._finishParsing = function(){
    //Parse the buffer to its end
    if(this._buffer){
        this._parseTags(true);
    }

    //without an onclosetag handler the stack is left as it is
    if(this._cbs.onclosetag){
        while(this._stack.length > 0){
            this._cbs.onclosetag(this._stack.pop());
        }
    }

    if(this._cbs.onend){
        this._cbs.onend();
    }
};
76
//Stops the parsing loop until resume() is called.
//Has no effect once the parser is done.
Parser.prototype.pause = function(){
    if(this._done){
        return;
    }
    this._running = false;
};
80
//Continues parsing after pause(). Does nothing if the parser is running.
//When end() was called in the meantime, the final flush happens here.
Parser.prototype.resume = function(){
    if(!this._running){
        this._running = true;

        //work through whatever piled up in the buffer while paused
        this._parseTags();

        if(this._done){
            this._finishParsing();
        }
    }
};
87
//Puts the parser back into its initial state so that a new document can be
//parsed, then notifies the onreset handler if there is one.
Parser.prototype.reset = function(){
    //Re-running the constructor without arguments re-initialises the state
    //and keeps the handlers and options that are already set.
    Parser.call(this);

    if(this._cbs.onreset){
        this._cbs.onreset();
    }
};
93
//Parses the attribute string of a tag into a name -> value map.
//data:    the raw content of a tag (tag name followed by its attributes)
//returns: an object with one property per attribute. An attribute written
//         without a value gets its own name as value; an explicitly empty
//         value (a="" or a='') stays the empty string.
var parseAttributes = function(data){
    var attrs = {}, match, value;

    //_reAttrib is a shared /g regex; always start scanning at the beginning
    _reAttrib.lastIndex = 0;

    while((match = _reAttrib.exec(data))){
        //Pick the first capture group that took part in the match. Testing
        //for undefined (instead of truthiness) keeps empty quoted values.
        value = match[2];
        if(value === undefined) value = match[3];
        if(value === undefined) value = match[4];
        if(value === undefined) value = match[1];

        attrs[match[1]] = value;
    }

    return attrs;
};
104
//Extracts the tag name from the raw content of a tag: everything up to the
//first whitespace, slash or the end of the string.
//data:    raw tag content, starting with the name
//returns: the name, lower-cased when the lowerCaseTags option is set
Parser.prototype._parseTagName = function(data){
    var nameEnd = data.search(_reTail);
    var name = data.substr(0, nameEnd);

    return this._options.lowerCaseTags ? name.toLowerCase() : name;
};
111
//Sections whose content is not parsed as regular markup. Each one owns a
//distinct power of two; the values are added to / subtracted from
//Parser#_contentFlags when such a section starts / ends.
//(ElementType.Tag has no entry: regular tags do not set a flag.)
var SpecialTags = {};
SpecialTags[ElementType.Style] = 1 << 0;   //1
SpecialTags[ElementType.Script] = 1 << 1;  //2
SpecialTags[ElementType.Comment] = 1 << 2; //4
SpecialTags[ElementType.CDATA] = 1 << 3;   //8

//Flag values looked up by the name of a closing tag
var TagValues = {
    script: 2,
    style: 1
};
124
//Splits the buffered input on "<" and ">" and dispatches every piece to the
//matching handler (text, open tag, close tag, comment, CDATA, processing
//instruction). Runs until no further separator is found or the parser gets
//paused; whatever was not consumed stays in this._buffer. Returns nothing.
//force: when truthy, `opening` is replaced by Infinity so that the remainder
//       of the buffer is consumed (used for the final flush).
Parser.prototype._parseTags = function(force){
    var buffer = this._buffer, current = 0;

    var next, rawData, elementData, lastTagSep, closeFlag;

    var opening = buffer.indexOf("<"), closing = buffer.indexOf(">");

    //if force is true, parse everything
    if(force) opening = 1/0;

    //opening !== closing is just false if both are -1
    while(opening !== closing && this._running){
        lastTagSep = this._tagSep;

        //pick whichever separator comes first and look up its successor
        if((opening !== -1 && opening < closing) || closing === -1){
            next = opening;
            this._tagSep = "<";
            opening = buffer.indexOf("<", next + 1);
        }
        else{
            next = closing;
            this._tagSep = ">";
            closing = buffer.indexOf(">", next + 1);
        }
        rawData = buffer.substring(current, next); //The next chunk of data to parse

        //set elements for next run
        current = next + 1;

        if(this._contentFlags >= SpecialTags[ElementType.CDATA]){
            //We're currently inside a CDATA section
            if(this._tagSep === ">" && rawData.substr(-2) === "]]"){
                if(rawData.length !== 2 && this._cbs.ontext){
                    this._cbs.ontext(rawData.slice(0,-2));
                }
                this._contentFlags -= SpecialTags[ElementType.CDATA];
                if(this._cbs.oncdataend) this._cbs.oncdataend();
            }
            else if(this._cbs.ontext) this._cbs.ontext(rawData + this._tagSep);
        }
        else if(this._contentFlags >= SpecialTags[ElementType.Comment]){
            //We're currently in a comment tag
            this._processComment(rawData);
        }
        else if(lastTagSep === "<"){
            elementData = rawData.trimLeft();
            if(elementData.charAt(0) === "/"){
                elementData = this._parseTagName(elementData.substr(1));
                if(this._contentFlags !== 0){
                    //Only the closing tag of the section we are in may end
                    //it. _contentFlags is a bit field, so the flag has to be
                    //tested with a mask: a magnitude comparison would let
                    //</style> (1) end a <script> section (2).
                    closeFlag = Object.prototype.hasOwnProperty.call(TagValues, elementData) ?
                        TagValues[elementData] : 0;
                    if((this._contentFlags & closeFlag) !== 0){
                        //remove the flag
                        this._contentFlags -= closeFlag;
                    }
                    else {
                        //any other closing tag is just content
                        this._writeSpecial(rawData, lastTagSep);
                        continue;
                    }
                }
                this._processCloseTag(elementData);
            }
            else if(elementData.charAt(0) === "!"){
                if(elementData.substr(1, 2) === "--"){
                    //This tag is a comment
                    this._contentFlags += SpecialTags[ElementType.Comment];
                    this._processComment(rawData.substr(3));
                }
                else if(elementData.substr(1, 7) === "[CDATA["){
                    if(this._cbs.oncdatastart) this._cbs.oncdatastart();
                    if(this._tagSep === ">" && elementData.substr(-2) === "]]"){
                        //The whole section is in this chunk: report the
                        //text before the end, like the multi-chunk path does
                        if(this._cbs.ontext) this._cbs.ontext(elementData.slice(8, -2));
                        if(this._cbs.oncdataend) this._cbs.oncdataend();
                    }
                    else{
                        if(this._cbs.ontext) this._cbs.ontext(elementData.substr(8) + this._tagSep);
                        this._contentFlags += SpecialTags[ElementType.CDATA];
                    }
                }
                else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep);
                //TODO: This isn't a processing instruction, needs a new name
                else if(this._cbs.onprocessinginstruction){
                    this._cbs.onprocessinginstruction(
                        "!" + this._parseTagName(elementData.substr(1)),
                        elementData
                    );
                }
            }
            else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep);
            else if(elementData.charAt(0) === "?"){
                if(this._cbs.onprocessinginstruction){
                    this._cbs.onprocessinginstruction(
                        "?" + this._parseTagName(elementData.substr(1)),
                        elementData
                    );
                }
            }
            else this._processOpenTag(this._parseTagName(elementData), elementData);
        }
        else{
            //the chunk followed a ">" (or the start of the input): it is text
            if(this._contentFlags !== 0){
                this._writeSpecial(rawData, ">");
            }
            else if(rawData !== "" && this._cbs.ontext){
                if(this._tagSep === ">") rawData += ">"; //it's the second > in a row
                this._cbs.ontext(rawData);
            }
        }
    }

    //keep the unparsed rest for the next call
    this._buffer = buffer.substring(current);
};
237
//Handles one chunk of comment content.
//If the chunk was terminated by ">" and ends in "--" the comment is over:
//the comment flag is removed and the text (without the "--") is reported,
//followed by oncommentend. Otherwise the chunk is reported together with the
//separator that terminated it.
//rawData: the chunk of comment text
Parser.prototype._processComment = function(rawData){
    var isLastChunk = this._tagSep === ">" && rawData.substr(-2) === "--";

    if(!isLastChunk){
        if(this._cbs.oncomment){
            this._cbs.oncomment(rawData + this._tagSep);
        }
        return;
    }

    //leave the comment section
    this._contentFlags -= SpecialTags[ElementType.Comment];
    this._wroteSpecial = false;

    if(this._cbs.oncomment){
        this._cbs.oncomment(rawData.slice(0, -2));
    }
    if(this._cbs.oncommentend){
        this._cbs.oncommentend();
    }
};
248
//Reports a chunk of content found inside a special section as text.
//The first chunk of a section is reported as it is (and skipped if empty);
//every following chunk gets the separator that preceded it put in front,
//because that separator was swallowed when the buffer was split.
//rawData:    the chunk of content
//lastTagSep: the separator ("<" or ">") to restore in front of the chunk
Parser.prototype._writeSpecial = function(rawData, lastTagSep){
    var ontext = this._cbs.ontext;

    if(!this._wroteSpecial){
        //The previous element was not text
        this._wroteSpecial = true;
        if(rawData !== "" && ontext){
            this._cbs.ontext(rawData);
        }
        return;
    }

    if(ontext){
        this._cbs.ontext(lastTagSep + rawData);
    }
};
259
//Elements that never have content and therefore no closing tag.
//The lookup table is built on a null prototype: it is queried with
//`name in emptyTags` using tag names taken from the document, and a plain
//object literal would also answer true for inherited names such as
//"constructor" or "toString".
var emptyTags = Object.create(null);
[
    "area",
    "base",
    "basefont",
    "br",
    "col",
    "frame",
    "hr",
    "img",
    "input",
    "isindex",
    "link",
    "meta",
    "param",
    "embed"
].forEach(function(tagName){
    emptyTags[tagName] = true;
});
276
//Handles a closing tag.
//If an element with that name is open, it and everything opened after it is
//closed: with an onclosetag handler each element is popped and reported
//(innermost first), without one the stack is simply cut off.
//Closing tags of void elements are ignored outside of xmlMode, except for
//</br>, which is turned into an opening <br>.
//name: the tag name of the closing tag
Parser.prototype._processCloseTag = function(name){
    //Own-property check: `name in emptyTags` would also be true for names
    //inherited from Object.prototype (e.g. "constructor"), and such
    //elements could then never be closed.
    var isVoid = !this._options.xmlMode &&
        Object.prototype.hasOwnProperty.call(emptyTags, name);

    if(this._stack && !isVoid){
        var pos = this._stack.lastIndexOf(name);
        if(pos !== -1){
            if(this._cbs.onclosetag){
                //number of elements to close, counted from the top
                pos = this._stack.length - pos;
                while(pos--) this._cbs.onclosetag(this._stack.pop());
            }
            else{
                this._stack.splice(pos);
            }
        }
    }
    //many browsers (eg. Safari, Chrome) convert </br> to <br>
    else if(name === "br" && !this._options.xmlMode){
        this._processOpenTag(name, "/");
    }
};
291
//Reports every attribute found in the raw content of a tag through the
//onattribute handler (name, value). An attribute written without a value is
//reported with its own name as value; an explicitly empty value (a="" or
//a='') is reported as the empty string.
//data: the raw content of a tag (tag name followed by its attributes)
Parser.prototype._parseAttributes = function(data){
    var match, value;

    //_reAttrib is a shared /g regex. Start at the beginning, so that a scan
    //that was aborted earlier (e.g. by a throwing handler) cannot make this
    //one skip attributes.
    _reAttrib.lastIndex = 0;

    while((match = _reAttrib.exec(data))){
        //first capture group that took part in the match; testing for
        //undefined (instead of truthiness) keeps empty quoted values
        value = match[2];
        if(value === undefined) value = match[3];
        if(value === undefined) value = match[4];
        if(value === undefined) value = match[1];

        this._cbs.onattribute(match[1], value);
    }
};
297
//Handles an opening tag: fires onopentagname, onopentag and onattribute (as
//far as they are set) and then either closes the element right away or puts
//it on the stack of open elements.
//name: the tag name
//data: the raw content of the tag (name followed by the attributes)
Parser.prototype._processOpenTag = function(name, data){
    var type = ElementType.Tag;
    if(this._options.xmlMode){ /*do nothing*/ }
    else if(name === "script") type = ElementType.Script;
    else if(name === "style") type = ElementType.Style;

    if(this._cbs.onopentagname){
        this._cbs.onopentagname(name);
    }
    if(this._cbs.onopentag){
        this._cbs.onopentag(name, parseAttributes(data), type);
    }
    if(this._cbs.onattribute){
        this._parseAttributes(data);
    }

    //Own-property check: `name in emptyTags` would also be true for names
    //inherited from Object.prototype (e.g. "constructor", "toString") and
    //wrongly treat such elements as void.
    var isVoid = !this._options.xmlMode &&
        Object.prototype.hasOwnProperty.call(emptyTags, name);

    //If tag self-terminates, add an explicit, separate closing tag
    if(data.substr(-1) === "/" || isVoid){
        if(this._cbs.onclosetag) this._cbs.onclosetag(name);
    } else {
        if(type !== ElementType.Tag){
            //script/style: the following content is not parsed as markup
            this._contentFlags += SpecialTags[type];
            this._wroteSpecial = false;
        }
        this._stack.push(name);
    }
};
325
//Reports an error: passes it to the onerror handler if one is set,
//otherwise throws it.
//error: the error to report
Parser.prototype._handleError = function(error){
    if(!this._cbs.onerror){
        throw error;
    }
    this._cbs.onerror(error);
};
331
332module.exports = Parser;
\No newline at end of file