UNPKG

8.19 kBJavaScriptView Raw
1var ElementType = require("./ElementType.js");
2
/**
 * Incremental tag parser that reports what it finds through callbacks.
 *
 * Also invoked without arguments by reset() to re-initialise the parsing
 * state of an existing instance while keeping its handlers and options.
 *
 * @param {Object} [cbs] Handler callbacks; when omitted the instance keeps
 *   whatever `_cbs` it already resolves to (by default the prototype's).
 * @param {Object} [options] Parser options; when omitted the instance keeps
 *   whatever `_options` it already resolves to (by default the prototype's).
 */
function Parser(cbs, options){
	if(options){
		this._options = options;
	}
	if(cbs){
		this._cbs = cbs;
	}

	//Fresh parsing state
	this._buffer = "";                   //input not yet consumed by _parseTags
	this._prevTagSep = "";               //last separator seen ("<" or ">")
	this._stack = [];                    //names of the currently open tags
	this._contentFlags = 0;              //sum of SpecialTags flags in effect
	this._done = false;                  //set once done() has run
	this._parseState = ElementType.Text; //type of the chunk that comes next
}
14
//**"Static"**//
//Patterns shared by all parser instances.

//Tag name matcher: after optional whitespace, group 1 is an optional "/"
//and group 2 is the name, i.e. everything up to the next whitespace or "/".
var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/;

//Attribute matcher, one attribute per exec() call. Alternatives in order:
//  name="value"  -> groups 1 and 2
//  name='value'  -> groups 3 and 4
//  name=value    -> groups 5 and 6
//  name          -> group 7 (no value)
//It carries the global flag, so lastIndex must be reset before scanning a
//new string.
var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;
19
//Default options, shared through the prototype. An instance only gets its
//own _options when the constructor is called with one.
Parser.prototype._options = {
	//false: script/style content gets special treatment
	xmlMode: false,
	//true: _parseTagName lower-cases every tag name
	lowerCaseTags: false
};

//Default (empty) handler set. Callbacks the parser looks up on it:
//  onopentag, onclosetag, ontext, onprocessinginstruction, oncomment,
//  onend, onreset, onerror
Parser.prototype._cbs = {};
34
//**Public**//
//Methods//

/**
 * Parses a whole document in one call: resets the parser, feeds `data` as a
 * single chunk and then signals the end of input.
 *
 * @param {string} data The complete markup.
 */
Parser.prototype.parseComplete = function(data){
	var parser = this;

	parser.reset();
	parser.parseChunk(data);
	parser.done();
};
43
/**
 * Appends a piece of the document to the internal buffer and parses as much
 * of the buffer as possible. Available as both `parseChunk` and `write`.
 *
 * Calling it after done() reports an Error through _handleError; if that
 * call returns (an onerror handler is installed), the chunk is still
 * buffered and parsed.
 *
 * @param {string} data The next piece of markup.
 */
Parser.prototype.parseChunk = function(data){
	if(this._done){
		this._handleError(Error("Attempted to parse chunk after parsing already done"));
	}

	this._buffer = this._buffer + data; //FIXME: this can be a bottleneck
	this._parseTags();
};

//Stream-style alias of parseChunk (same function object)
Parser.prototype.write = Parser.prototype.parseChunk;
51
/**
 * Signals that the input is complete. Only the first call has an effect.
 *
 * Parses whatever is still buffered (forced), reports a close for every tag
 * still on the stack (innermost first) when an onclosetag handler exists,
 * and finally calls onend if present.
 */
Parser.prototype.done = function(){
	if(this._done) return;
	this._done = true;

	//Flush the remaining buffer
	if(this._buffer){
		this._parseTags(true);
	}

	//Close everything that was left open
	if(this._cbs.onclosetag){
		while(this._stack.length > 0){
			var openName = this._stack.pop();
			this._cbs.onclosetag(openName);
		}
	}

	if(this._cbs.onend){
		this._cbs.onend();
	}
};
66
/**
 * Returns the parser to a blank state so it can parse a new document, then
 * notifies the onreset handler if there is one.
 */
Parser.prototype.reset = function(){
	//Re-running the constructor without arguments re-initialises the parsing
	//state and leaves the current handlers and options in place
	Parser.call(this);

	if(!this._cbs.onreset) return;
	this._cbs.onreset();
};
72
//**Private**//

/**
 * Extracts the attributes from the raw text of a tag (tag name included).
 * Everything from the first whitespace character on is scanned with
 * _reAttrib.
 *
 * @param {string} data Raw tag text, e.g. 'a href="x" disabled'.
 * @returns {Object} Attribute name -> value. An attribute without a value
 *   maps to its own name. Empty when `data` contains no whitespace.
 */
var parseAttributes = function(data){
	var attrs = {};

	var firstSpace = data.search(/\s/);
	if(firstSpace === -1) return attrs;

	var attribRaw = data.substr(firstSpace);
	var match;

	_reAttrib.lastIndex = 0; //the regex is global and shared, so rewind it
	while((match = _reAttrib.exec(attribRaw)) !== null){
		//Exactly one alternative matched; its name group is non-empty
		var key = match[1] || match[3] || match[5] || match[7];
		if(!key) continue;

		var value;
		if(match[1]) value = match[2];      //name="value"
		else if(match[3]) value = match[4]; //name='value'
		else if(match[5]) value = match[6]; //name=value
		else value = key;                   //bare name

		attrs[key] = value;
	}

	return attrs;
};
92
/**
 * Extracts the tag name from the raw text of a tag.
 *
 * @param {string} data Raw tag text, e.g. "/div" or 'a href="x"'.
 * @returns {string} The name, prefixed with "/" for a closing tag and
 *   lower-cased when the lowerCaseTags option is set; "" when no name is
 *   found.
 */
Parser.prototype._parseTagName = function(data){
	var found = data.match(_reTagName);
	if(found === null) return "";

	var slash = found[1];
	var tagName = this._options.lowerCaseTags ? found[2].toLowerCase() : found[2];
	return slash + tagName;
};
102
//Flag values for content that is treated differently. The flags in effect
//are summed up in Parser#_contentFlags.
var SpecialTags = {};
SpecialTags[ElementType.Tag] = 0;     //ordinary tags contribute nothing
SpecialTags[ElementType.Style] = 1;   //2^0
SpecialTags[ElementType.Script] = 2;  //2^1
SpecialTags.w = 4;                    //2^2 - if set, append prev tag sep to data
SpecialTags[ElementType.Comment] = 8; //2^3
110
/**
 * Scans the buffered input for "<" and ">" separators and dispatches every
 * chunk between two separators to the handlers as text, tag, comment or
 * processing instruction. Whatever follows the last separator stays in
 * this._buffer for the next call. Nothing is returned.
 *
 * @param {boolean} [force] When truthy, "<" is no longer searched for and
 *   the remainder of the buffer is processed as a final chunk (done() uses
 *   this to flush the buffer).
 */
Parser.prototype._parseTags = function(force){
	var buffer = this._buffer, current = 0;

	var next, tagSep, rawData, elementName, elementType, elementData;

	var opening = buffer.indexOf("<"), closing = buffer.indexOf(">");

	//if force is true, parse everything: with opening at Infinity the last
	//pass takes next = Infinity and consumes the rest of the buffer
	if(force) opening = Infinity;

	while(opening !== closing){ //just false if both are -1
		//Take whichever separator comes first
		if((opening !== -1 && opening < closing) || closing === -1){
			next = opening;
			tagSep = "<";
			opening = buffer.indexOf(tagSep, next + 1);
		}
		else{
			next = closing;
			tagSep = ">";
			closing = buffer.indexOf(tagSep, next + 1);
		}
		rawData = buffer.substring(current, next); //The next chunk of data to parse
		elementType = this._parseState;

		//set elements for next run: what follows "<" is a tag, what follows
		//">" is text
		current = next + 1;
		this._parseState = (tagSep === "<") ? ElementType.Tag : ElementType.Text;

		if(elementType === ElementType.Tag){
			elementData = rawData.trim();
			elementName = this._parseTagName(elementData);
		}
		else{
			elementData = rawData;
			elementName = "";
		}

		//This section inspects the current content flags and modifies the
		//handling of the chunk if we're inside a special area
		//(script/comment/style tag)
		if(this._contentFlags === 0){ /*do nothing*/ }
		else if(this._contentFlags >= SpecialTags[ElementType.Comment]){
			//We're currently in a comment tag (comment is the highest flag)
			this._processComment(rawData, tagSep);
			continue;
		}
		//if it's the matching closing tag, remove the flag.
		//The flags are tested as bits: a magnitude test would let "</style>"
		//inside a script (or "</script>" inside a style) end the special
		//area and swallow that text.
		else if((this._contentFlags & SpecialTags[ElementType.Script]) !== 0 && elementName === "/script"){
			//remove the script flag (also removes the written flag)
			this._contentFlags %= SpecialTags[ElementType.Script];
		}
		else if((this._contentFlags & SpecialTags[ElementType.Style]) !== 0 && elementName === "/style"){
			//remove the style flag (also removes the written flag)
			this._contentFlags %= SpecialTags[ElementType.Style];
		}
		//special behaviour for script & style tags: everything is text.
		//Chunks starting a comment are left to the regular processing below
		else if(!this._options.xmlMode && rawData.substring(0, 3) !== "!--"){
			//If text was already written, the separator that was consumed
			//before this chunk belongs to the text and is put back
			if(this._contentFlags >= SpecialTags.w){
				if(this._cbs.ontext) this._cbs.ontext(this._prevTagSep + rawData);
			}
			else{ //The previous element was not text
				this._contentFlags += SpecialTags.w;
				if(rawData !== "" && this._cbs.ontext) this._cbs.ontext(rawData);
			}
			this._prevTagSep = tagSep;
			continue;
		}

		//Processing of non-special tags
		if(elementType === ElementType.Tag){
			if(rawData.substring(0, 3) === "!--"){ //This tag is a comment
				this._contentFlags += SpecialTags[ElementType.Comment];
				this._processComment(rawData.substr(3), tagSep);
				continue;
			}

			if(rawData.charAt(0) === "!" || rawData.charAt(0) === "?"){
				//ElementType.Directive
				//TODO: what about CDATA?
				if(this._cbs.onprocessinginstruction){
					this._cbs.onprocessinginstruction(elementName, elementData);
				}
				continue;
			}
			if(elementName.charAt(0) === "/") this._processCloseTag(elementName.substr(1));
			else this._processOpenTag(elementName, elementData, tagSep);
		}
		else if(elementType === ElementType.Text && rawData !== "" && this._cbs.ontext){
			this._cbs.ontext(elementData);
		}
	}

	//Keep the unparsed tail for the next chunk
	this._buffer = buffer.substring(current);
};
207
/**
 * Handles one chunk of comment content and forwards it to oncomment.
 *
 * A chunk that was terminated by ">" and ends in "--" closes the comment:
 * the comment and written flags are cleared and the trailing "--" is
 * dropped. Any other chunk is still inside the comment, so the separator
 * that terminated it is appended to the text.
 *
 * @param {string} rawData Chunk text (without the separator).
 * @param {string} tagSep Separator that ended the chunk ("<" or ">").
 */
Parser.prototype._processComment = function(rawData, tagSep){
	this._prevTagSep = tagSep;

	var endsComment = tagSep === ">" && rawData.substr(-2) === "--";
	var text;

	if(endsComment){
		//remove the written flag (also removes the comment flag)
		this._contentFlags %= SpecialTags.w;
		text = rawData.slice(0, -2);
	}
	else{
		text = rawData + tagSep;
	}

	if(this._cbs.oncomment){
		this._cbs.oncomment(text);
	}
};
220
221var emptyTags = require("./ClosingTags.js").self;
222
/**
 * Tells whether `name` is listed in emptyTags (the `self` export of
 * ./ClosingTags.js). Always false in xmlMode.
 *
 * @param {string} name Tag name to look up.
 * @returns {boolean} true when the tag is listed and xmlMode is off.
 */
Parser.prototype._isEmptyTag = function(name){
	if(this._options.xmlMode) return false;

	//Only own entries count: a plain lookup would also find members inherited
	//from Object.prototype, so tags named "constructor" or "toString" would
	//be reported as empty tags
	return Object.prototype.hasOwnProperty.call(emptyTags, name) && Boolean(emptyTags[name]);
};
226
/**
 * Handles a closing tag.
 *
 * For a regular tag, the innermost open element with that name is looked up
 * on the stack. If there is one, it and everything opened after it are
 * removed, reporting each through onclosetag (innermost first) when that
 * handler exists. A closing tag without a matching open element is ignored.
 *
 * @param {string} name Tag name without the leading "/".
 */
Parser.prototype._processCloseTag = function(name){
	var isRegularTag = this._stack && !this._isEmptyTag(name);

	if(!isRegularTag){
		//many browsers (eg. Safari, Chrome) convert </br> to <br>
		if(name === "br" && !this._options.xmlMode){
			this._processOpenTag(name, "/");
		}
		return;
	}

	//Innermost open element with this name
	var pos = this._stack.lastIndexOf(name);
	if(pos === -1) return;

	if(this._cbs.onclosetag){
		while(pos < this._stack.length){
			this._cbs.onclosetag(this._stack.pop());
		}
	}
	else{
		this._stack.splice(pos);
	}
};
242
/**
 * Handles an opening tag: reports it through onopentag and either closes it
 * right away (tag text ends in "/" or the tag is an empty tag) or records
 * it as open.
 *
 * @param {string} name Tag name.
 * @param {string} data Raw tag text; attributes are parsed from it.
 * @param {string} [tagSep] Separator that ended the tag; remembered as the
 *   previous separator when the tag stays open.
 */
Parser.prototype._processOpenTag = function(name, data, tagSep){
	//Outside xmlMode, script and style get their own element type
	var type = ElementType.Tag;
	if(!this._options.xmlMode){
		if(name === "script") type = ElementType.Script;
		else if(name === "style") type = ElementType.Style;
	}

	if(this._cbs.onopentag){
		this._cbs.onopentag(name, parseAttributes(data), type);
	}

	var closesItself = data.substr(-1) === "/" || this._isEmptyTag(name);

	if(!closesItself){
		//The tag stays open: add its flag and push it on the stack
		this._contentFlags += SpecialTags[type];
		this._stack.push(name);
		this._prevTagSep = tagSep;
		return;
	}

	//If tag self-terminates, add an explicit, separate closing tag
	if(this._cbs.onclosetag){
		this._cbs.onclosetag(name);
	}
};
262
/**
 * Reports an error: hands it to the onerror handler when one is installed,
 * otherwise throws it.
 *
 * @param {Error} error The error to report.
 * @throws {Error} `error` itself when there is no onerror handler.
 */
Parser.prototype._handleError = function(error){
	if(!this._cbs.onerror) throw error;

	this._cbs.onerror(error);
};
268
269module.exports = Parser;
\No newline at end of file