UNPKG

8.73 kBJavaScriptView Raw
1var ElementType = require("./ElementType.js");
2
//Constructor: stores the supplied callbacks/options and (re)initialises
//all per-document state. Also invoked without arguments by reset().
// cbs     - optional object holding the event callbacks
// options - optional object holding the parser options
function Parser(cbs, options){
	//Falsy arguments leave whatever the instance (or prototype) already provides
	if(options){
		this._options = options;
	}
	if(cbs){
		this._cbs = cbs;
	}

	this._buffer = ""; //input that has been received but not yet consumed
	this._tagSep = ""; //the separator ("<" or ">") that ended the last chunk
	this._stack = []; //names of the currently open tags
	this._wroteSpecial = false; //whether _writeSpecial has run since the flag was last cleared
	this._contentFlags = 0; //sum of the SpecialTags values currently active
	this._done = false; //set once end() has been called
}
14
//Patterns used to pick tag names and attributes out of the raw tag text.

//Matches one attribute at a time. Capture groups:
// 1 = attribute name, 2 = double-quoted value,
// 3 = single-quoted value, 4 = unquoted value.
//Carries the "g" flag, so exec() advances lastIndex from match to match.
var _reAttrib = /\s(\S+?)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+))|(?=\s)|\/|$)/g;

//Finds where a tag name stops: whitespace, a slash or the end of the string.
var _reTail = /\s|\/|$/;
18
//Default options, used by every instance that was constructed without its own
//options object (the constructor only overrides this when one is passed).
Parser.prototype._options = {
	//When true, <script>/<style> are treated like any other tag and the
	//emptyTags list is ignored by the open/close tag handling
	xmlMode: false,
	//When true, _parseTagName lower-cases every tag name it returns
	lowerCaseTags: false
};
23
//Default (empty) callback map, used when no callbacks were passed to the
//constructor. Every callback is optional; the parser checks for its presence
//before calling it.
Parser.prototype._cbs = {
	/*
		Callbacks invoked by the parser, with the arguments it passes:

		oncdataend(),
		oncdatastart(),
		onclosetag(name),
		oncomment(data),
		oncommentend(),
		onend(),
		onerror(error),
		onopentag(name, attribs, type),
		onprocessinginstruction(name, data),
		onreset(),
		ontext(data)
	*/
};
38
//Convenience entry point for a whole document that is available at once:
//resets the parser, feeds it `data` as a single chunk and then ends the parse.
Parser.prototype.parseComplete = function(data){
	var parser = this;

	parser.reset(); //drop any state left over from a previous document
	parser.write(data);
	parser.end();
};
45
//Parses a piece of an HTML document (available as both write and parseChunk).
//The chunk is appended to the internal buffer and every complete piece of the
//buffer is dispatched to the callbacks right away.
// data - the next chunk of markup
//Calling this after end() reports an error through _handleError (which throws
//when no onerror callback is registered) and ignores the chunk.
Parser.prototype.write =
Parser.prototype.parseChunk = function(data){
	if(this._done){
		this._handleError(new Error("Attempted to parse chunk after parsing already done"));
		//The document was already finalised: do not feed more data into it
		return;
	}
	this._buffer += data; //FIXME: this can be a bottleneck
	this._parseTags();
};
53
//Signals that the document is complete (available as both end and done).
// chunk - optional final piece of markup, written before finishing
//Flushes whatever is left in the buffer, reports a close for every tag that
//is still open and finally fires onend. Further calls are ignored.
Parser.prototype.end = Parser.prototype.done = function(chunk){
	var name;

	if(this._done){
		return;
	}

	if(chunk){
		this.write(chunk);
	}
	this._done = true;

	//Force-parse the remainder, even if it does not end in a separator
	if(this._buffer){
		this._parseTags(true);
	}

	//Unwind the stack, innermost tag first
	if(this._cbs.onclosetag){
		while(this._stack.length > 0){
			name = this._stack.pop();
			this._cbs.onclosetag(name);
		}
	}

	if(this._cbs.onend){
		this._cbs.onend();
	}
};
70
//Puts the parser back into its initial state so a new document can be parsed.
//The constructor is re-run without arguments, which clears the parsing state
//but keeps the callbacks and options; onreset is fired afterwards.
Parser.prototype.reset = function(){
	Parser.apply(this);

	if(this._cbs.onreset){
		this._cbs.onreset();
	}
};
76
//Parses the attributes out of the raw text of a tag.
// data - the tag text, including the leading tag name (e.g. 'a href="x"')
//Returns an object that maps each attribute name to its value. An attribute
//written without a value is reported with its own name as the value; an
//explicitly empty value (alt="") is reported as the empty string.
var parseAttributes = function(data){
	var attrs = {}, match, name, value;

	//_reAttrib is shared and global ("g"): always start scanning at the beginning
	_reAttrib.lastIndex = 0;

	while((match = _reAttrib.exec(data)) !== null){
		name = match[1];

		//Groups that did not take part in the match are undefined, while a
		//quoted empty value is "" - so test for undefined rather than truthiness
		if(match[2] !== undefined) value = match[2];
		else if(match[3] !== undefined) value = match[3];
		else if(match[4] !== undefined) value = match[4];
		//a detached "/" is the self-closing marker (<br />), not an attribute
		else if(name === "/") continue;
		else value = name;

		attrs[name] = value;
	}

	return attrs;
};
87
//Returns the tag name found at the start of `data`: everything up to the
//first whitespace character, slash or the end of the string.
//The name is lower-cased when the lowerCaseTags option is set.
Parser.prototype._parseTagName = function(data){
	//_reTail also matches the end of the string, so search() never returns -1
	var end = data.search(_reTail);
	var name = data.substring(0, end);

	return this._options.lowerCaseTags ? name.toLowerCase() : name;
};
94
//Special tags that are treated differently.
//Maps an element type to the value that is added to (and later subtracted
//from) the parser's _contentFlags while content of that type is being read.
//_parseTags compares _contentFlags against the CDATA and Comment values with
//">=", so the relative size of these numbers matters.
var SpecialTags = {};
//SpecialTags[ElementType.Tag] = 0;
SpecialTags[ElementType.Style] = 1; //2^0
SpecialTags[ElementType.Script] = 2; //2^1
SpecialTags[ElementType.Comment] = 4; //2^2
SpecialTags[ElementType.CDATA] = 8; //2^3
102
//Splits the buffered input at every "<" and ">" and dispatches each piece to
//the matching handler/callback. Nothing is returned; results are delivered
//through the callbacks and the unconsumed tail is kept in this._buffer.
// force - when truthy, "<" is no longer treated as a separator and the
//         remainder of the buffer is consumed even though no separator ends it
//         (used by end() to flush the buffer)
Parser.prototype._parseTags = function(force){
	var buffer = this._buffer, current = 0;

	var next, rawData, elementData, lastTagSep;

	var opening = buffer.indexOf("<"), closing = buffer.indexOf(">");

	//if force is true, parse everything
	if(force) opening = 1/0;

	while(opening !== closing){ //just false if both are -1
		//the separator in front of the chunk that is about to be read
		lastTagSep = this._tagSep;

		//pick whichever separator comes first; this._tagSep then names the
		//separator that ends the current chunk
		if((opening !== -1 && opening < closing) || closing === -1){
			next = opening;
			this._tagSep = "<";
			opening = buffer.indexOf("<", next + 1);
		}
		else{
			next = closing;
			this._tagSep = ">";
			closing = buffer.indexOf(">", next + 1);
		}
		rawData = buffer.substring(current, next); //The next chunk of data to parse

		//set elements for next run
		current = next + 1;

		if(this._contentFlags >= SpecialTags[ElementType.CDATA]){
			//We're currently inside a CDATA section
			if(this._tagSep === ">" && rawData.substr(-2) === "]]"){
				//the section ends here; emit what precedes the "]]" (if anything)
				if(rawData.length !== 2 && this._cbs.ontext){
					this._cbs.ontext(rawData.slice(0,-2));
				}
				this._contentFlags -= SpecialTags[ElementType.CDATA];
				if(this._cbs.oncdataend) this._cbs.oncdataend();
			}
			//the separator was part of the CDATA content, so put it back
			else if(this._cbs.ontext) this._cbs.ontext(rawData + this._tagSep);
		}
		else if(this._contentFlags >= SpecialTags[ElementType.Comment]){
			//We're currently in a comment tag
			this._processComment(rawData);
		}
		else if(lastTagSep === "<"){
			//the chunk follows a "<", so it is the inside of some kind of tag
			elementData = rawData.trimLeft();
			if(elementData.charAt(0) === "/"){
				//closing tag: reduce the chunk to the bare tag name
				elementData = this._parseTagName(elementData.substr(1));
				if(this._contentFlags !== 0){
					//if it's a closing tag, remove the flag
					if(this._contentFlags === SpecialTags[ElementType.Script] && elementData === "script"){
						//remove the script flag
						this._contentFlags -= SpecialTags[ElementType.Script];
					}
					else if(this._contentFlags === SpecialTags[ElementType.Style] && elementData === "style"){
						//remove the style flag
						this._contentFlags -= SpecialTags[ElementType.Style];
					}
					else {
						//any other closing tag inside script/style is plain content
						this._writeSpecial(rawData, lastTagSep);
						continue;
					}
				}
				this._processCloseTag(elementData);
			}
			else if(elementData.charAt(0) === "!"){
				if(elementData.substr(1, 2) === "--"){
					//This tag is a comment
					this._contentFlags += SpecialTags[ElementType.Comment];
					//skip the "!--" of the trimmed text (the check above was made
					//on elementData, so the offset has to be applied to it as well)
					this._processComment(elementData.substr(3));
				}
				else if(elementData.substr(1, 7) === "[CDATA["){
					if(this._cbs.oncdatastart) this._cbs.oncdatastart();
					if(this._tagSep === ">" && elementData.substr(-2) === "]]"){
						//the whole section fits into this chunk: report its text
						//before announcing the end of the section
						if(this._cbs.ontext) this._cbs.ontext(elementData.slice(8, -2));
						if(this._cbs.oncdataend) this._cbs.oncdataend();
					}
					else{
						//the section continues: the separator that ended this chunk
						//is part of the content and must not be dropped
						if(this._cbs.ontext) this._cbs.ontext(elementData.substr(8) + this._tagSep);
						this._contentFlags += SpecialTags[ElementType.CDATA];
					}
				}
				else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep);
				//This tag is a directive
				else if(this._cbs.onprocessinginstruction){
					this._cbs.onprocessinginstruction(
						"!" + this._parseTagName(elementData.substr(1)),
						elementData
					);
				}
			}
			//inside script/style everything else is plain content
			else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep);
			else if(elementData.charAt(0) === "?"){
				if(this._cbs.onprocessinginstruction){
					this._cbs.onprocessinginstruction(
						"?" + this._parseTagName(elementData.substr(1)),
						elementData
					);
				}
			}
			else this._processOpenTag(this._parseTagName(elementData), elementData);
		}
		else{
			//the chunk does not follow a "<": it is text
			if(this._contentFlags !== 0){
				this._writeSpecial(rawData, lastTagSep);
			}
			else if(rawData !== "" && this._cbs.ontext){
				this._cbs.ontext(rawData);
			}
		}
	}

	//keep whatever follows the last separator for the next call
	this._buffer = buffer.substring(current);
};
217
//Handles one chunk of comment content.
// rawData - the chunk, without the separator that ended it
//A chunk that was ended by ">" and finishes with "--" closes the comment;
//any other chunk is reported together with the separator that followed it.
Parser.prototype._processComment = function(rawData){
	var terminated = this._tagSep === ">" && rawData.substr(-2) === "--";

	if(!terminated){
		//still inside the comment: the separator belongs to the comment text
		if(this._cbs.oncomment) this._cbs.oncomment(rawData + this._tagSep);
		return;
	}

	//comment ends: drop the comment flag and the "wrote special" marker
	this._contentFlags -= SpecialTags[ElementType.Comment];
	this._wroteSpecial = false;
	if(this._cbs.oncomment) this._cbs.oncomment(rawData.slice(0, -2));
	if(this._cbs.oncommentend) this._cbs.oncommentend();
};
228
//Reports a chunk of raw content as text.
// rawData    - the chunk, without separators
// lastTagSep - the separator that preceded the chunk
//The first chunk after _wroteSpecial was cleared is reported as-is (and
//skipped when empty); every later chunk gets its preceding separator put
//back in front of it.
Parser.prototype._writeSpecial = function(rawData, lastTagSep){
	var text;

	if(!this._wroteSpecial){
		//The previous element was not text
		this._wroteSpecial = true;
		if(rawData === "") return;
		text = rawData;
	}
	else{
		//continuing text: restore the separator that split it
		text = lastTagSep + rawData;
	}

	if(this._cbs.ontext) this._cbs.ontext(text);
};
239
//Names of the tags that never have content (void elements). Outside of
//xmlMode these are closed as soon as they are opened and their explicit
//closing tags are ignored.
//The lookup table has no prototype, so tag names such as "constructor" or
//"toString" are not mistaken for void elements through inherited properties.
var emptyTags = Object.create(null);
[
	"area",
	"base",
	"basefont",
	"br",
	"col",
	"command",
	"embed",
	"frame",
	"hr",
	"img",
	"input",
	"isindex",
	"keygen",
	"link",
	"meta",
	"param",
	"source",
	"track",
	"wbr"
].forEach(function(name){
	emptyTags[name] = true;
});
256
//Handles a closing tag.
// name - the (already extracted) tag name
//If the tag is open, it and every tag opened after it are removed from the
//stack, innermost first, with onclosetag fired for each one. Closing tags
//that match nothing are ignored. Closing tags of void elements are ignored
//outside of xmlMode, except for </br>, which is turned into <br>.
Parser.prototype._processCloseTag = function(name){
	var index, remaining;

	if(this._stack && (!emptyTags[name] || this._options.xmlMode)){
		index = this._stack.lastIndexOf(name);
		if(index === -1){
			return;
		}

		if(!this._cbs.onclosetag){
			//nobody is listening: just cut the stack back to below the tag
			this._stack.splice(index);
			return;
		}

		//pop (and report) everything from the top of the stack down to the tag
		for(remaining = this._stack.length - index; remaining > 0; remaining--){
			this._cbs.onclosetag(this._stack.pop());
		}
		return;
	}

	//many browsers (eg. Safari, Chrome) convert </br> to <br>
	if(name === "br" && !this._options.xmlMode){
		this._processOpenTag(name, "/");
	}
};
271
//Handles an opening tag.
// name - the tag name
// data - the raw text of the tag (name plus attributes)
//Fires onopentag with the parsed attributes and the element type. Tags that
//end in "/" and (outside of xmlMode) void elements are closed right away;
//all other tags are pushed onto the stack.
Parser.prototype._processOpenTag = function(name, data){
	var type = ElementType.Tag;
	var selfClosing;

	//script and style only get their own type outside of xmlMode
	if(!this._options.xmlMode){
		if(name === "script") type = ElementType.Script;
		else if(name === "style") type = ElementType.Style;
	}

	if(this._cbs.onopentag){
		this._cbs.onopentag(name, parseAttributes(data), type);
	}

	selfClosing = data.slice(-1) === "/" || (emptyTags[name] && !this._options.xmlMode);

	if(selfClosing){
		//If tag self-terminates, add an explicit, separate closing tag
		if(this._cbs.onclosetag) this._cbs.onclosetag(name);
		return;
	}

	if(type !== ElementType.Tag){
		//entering script/style content: raise its flag and start a fresh text run
		this._contentFlags += SpecialTags[type];
		this._wroteSpecial = false;
	}
	this._stack.push(name);
};
293
//Reports an error: hands it to the onerror callback when one is registered,
//otherwise throws it.
// error - the error to report
Parser.prototype._handleError = function(error){
	if(!this._cbs.onerror){
		throw error;
	}
	this._cbs.onerror(error);
};
299
300module.exports = Parser;
\No newline at end of file