UNPKG

12.1 kBJavaScriptView Raw
1var ElementType = require("./ElementType.js");
2
//Creates a parser in its initial state.
// cbs:     object holding the handler callbacks (falls back to defaultCbs when falsy)
// options: parser options (falls back to defaultOpts when falsy)
function Parser(cbs, options){
	//configuration
	this._options = options || defaultOpts;
	this._cbs = cbs || defaultCbs;

	//parsing state
	this._buffer = ""; //input that has not been consumed yet
	this._tagSep = ">"; //separator ("<" or ">") that ended the previously consumed piece
	this._stack = []; //names of the currently open tags
	this._wroteSpecial = false; //whether text was already emitted inside the current special tag
	this._contentFlags = 0; //bit set of the special contexts the parser is currently in

	//lifecycle
	this._done = false; //true once end() was called
	this._running = true; //false if paused
}
14
15//Regular expressions used for cleaning up and parsing (stateless)
16
/* http://dev.w3.org/html5/html-author/#attributes
 * - Whitespace is permitted after the tag name, but it is not permitted before the tag name.
 * - Attribute names must consist of one or more characters other than the space characters,
 *   control characters, NULL, one of the characters: double quote ("), single quote ('),
 *   greater-than sign (>), solidus (/), equals sign (=), nor any characters that are not defined by Unicode.
 * - An empty attribute is one where the value has been omitted (e.g. <input disabled>...</input>).
 * - An unquoted attribute value must not contain any literal space characters, any of the characters:
 *   double quote ("), apostrophe ('), equals sign (=), less-than sign (<), greater-than sign (>),
 *   or grave accent (`), and the value must not be the empty string.
 * - There may be space characters between the attribute name and the equals sign (=),
 *   and between that and the attribute value.
 * - Double-quoted attributes must not contain any double-quote characters or ambiguous ampersands.
 * - Single-quoted attributes must not contain any single-quote characters or ambiguous ampersands.
 */
31// element name: (<[^<& ]+)
32// attribute name: ( [^"'=>\/]+)
33// attribute value: (\s*=\s*(?:
34// "([^"]*)"|
35// '([^']*)'|
36// [^\s"'=<>`]+)
37// tag end: (?=\s|\/|$)
38
//Matches a single attribute, including the whitespace in front of it.
//Capture groups: 1 = name, 2 = double-quoted value, 3 = single-quoted value, 4 = unquoted value.
//The expression is global, so exec() continues from lastIndex on every call.
var _reAttrib = /\s+([^"'=>\/\s]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))|(?=\s)|\/|$)/g;

//Matches the position where a tag name ends: whitespace, a solidus or the end of the string
var _reTail = /\s|\/|$/;
41
//Options used when the Parser constructor is called without an options object
var defaultOpts = {
	//Special behavior for script/style tags by default
	xmlMode: false,
	//call .toLowerCase for each attribute name
	lowerCaseAttributeNames: false,
	//call .toLowerCase for each tag name
	lowerCaseTags: false
};
47
//Callbacks used when the Parser constructor is called without a handler object.
//This is just a plain, empty object so that the parser doesn't throw
//if no arguments were provided.
//
//Callback names a handler may define:
//	oncdataend, oncdatastart, onclosetag, oncomment, oncommentend, onerror,
//	onopentag, onopentagend, onprocessinginstruction, onreset, ontext
var defaultCbs = {};
69
//Form-related tags; shared as the "closes" set of several entries in openImpliesClose.
//`__proto__: null` keeps `in` lookups from matching names inherited from
//Object.prototype (e.g. "constructor", "toString").
var formTags = {
	__proto__: null,
	input: true,
	option: true,
	optgroup: true,
	select: true,
	button: true,
	datalist: true,
	textarea: true
};

//Maps the name of a tag that is being opened to the set of tag names that get
//closed implicitly when one of them is the innermost open tag.
//Both the outer table and every inner set are prototype-less, so a tag called
//e.g. <toString> is neither treated as a key of this table nor as a member of a set.
var openImpliesClose = {
	__proto__: null,
	tr      : { __proto__: null, tr:true, th:true, td:true },
	th      : { __proto__: null, th:true },
	td      : { __proto__: null, thead:true, td:true },
	body    : { __proto__: null, head:true, link:true, script:true },
	li      : { __proto__: null, li:true },
	p       : { __proto__: null, p:true },
	select  : formTags,
	input   : formTags,
	output  : formTags,
	button  : formTags,
	datalist: formTags,
	textarea: formTags,
	option  : { __proto__: null, option:true },
	optgroup: { __proto__: null, optgroup:true }
};
95
//Parses a complete HTML document in one call:
//the parser is reset first, then `data` is fed to it and parsing is ended.
Parser.prototype.parseComplete = function(data){
	this.reset();
	this.end(data);
};
101
//Parses a piece of an HTML document.
//The chunk is appended to the internal buffer and, unless the parser is paused,
//the buffer is parsed right away. Writing after end() reports an error through
//_handleError; if that call returns, the chunk is still processed.
Parser.prototype.write = function(data){
	if(this._done){
		this._handleError("Attempted to parse chunk after parsing already done");
	}

	this._buffer += data; //FIXME: this can be a bottleneck

	if(this._running){
		this._parseTags();
	}
};

//Alias of write()
Parser.prototype.parseChunk = Parser.prototype.write;
109
//Tells the parser that the HTML being parsed is complete.
//An optional, truthy last chunk is written first. Calls after the first one are ignored.
//While the parser is paused, finishing is deferred (resume() takes care of it).
Parser.prototype.end = function(chunk){
	if(this._done){
		return;
	}

	if(chunk){
		this.write(chunk);
	}
	this._done = true;

	if(this._running){
		this._finishParsing();
	}
};

//Alias of end()
Parser.prototype.done = Parser.prototype.end;
120
//Flushes what is left after the input ended:
//parses the remaining buffer, reports a close for every tag that is still open
//(only when an onclosetag callback exists) and finally calls onend.
Parser.prototype._finishParsing = function(){
	//Parse the buffer to its end
	if(this._buffer){
		this._parseTags(true);
	}

	if(this._cbs.onclosetag){
		while(this._stack.length){
			this._cbs.onclosetag(this._stack.pop());
		}
	}

	if(this._cbs.onend){
		this._cbs.onend();
	}
};
131
//Pauses parsing; has no effect once end() was called
Parser.prototype.pause = function(){
	if(this._done){
		return;
	}
	this._running = false;
};
135
//Continues parsing after pause().
//The buffered input is parsed and, if end() was called in the meantime,
//parsing is finished. Does nothing when the parser is not paused.
Parser.prototype.resume = function(){
	if(!this._running){
		this._running = true;
		this._parseTags();
		if(this._done){
			this._finishParsing();
		}
	}
};
142
//Resets the parser to a blank state, ready to parse a new HTML document.
//The constructor is re-run on this instance with the current callbacks and options,
//afterwards the onreset callback (if any) is notified.
Parser.prototype.reset = function(){
	Parser.call(this, this._cbs, this._options);

	if(this._cbs.onreset){
		this._cbs.onreset();
	}
};
148
//Extracts the base tag name from the data value of an element:
//everything up to the first whitespace, solidus or the end of the string.
//The name is lower-cased when the lowerCaseTags option is set.
Parser.prototype._parseTagName = function(data){
	var nameEnd = data.search(_reTail),
	    tagName = data.substring(0, nameEnd);

	return this._options.lowerCaseTags ? tagName.toLowerCase() : tagName;
};
155
//Special tags that are treated differently.
//Each element type owns one bit of the parser's _contentFlags
//(a plain ElementType.Tag would be 0, i.e. no flag).
var SpecialTags = {};
SpecialTags[ElementType.Style] = 1; //2^0
SpecialTags[ElementType.Script] = 2; //2^1
SpecialTags[ElementType.Comment] = 4; //2^2
SpecialTags[ElementType.CDATA] = 8; //2^3

//Flag bit cleared from _contentFlags when a closing tag with that name is found
var TagValues = {
	style: 1,
	script: 2
};
168
//Parses through the buffered HTML text and reports what it finds to the callbacks.
//
//The buffer is cut at every "<" and ">". Each piece is interpreted based on the
//separator in front of it (the previous value of this._tagSep) and on
//this._contentFlags (inside CDATA, a comment, or a script/style element).
//Input after the last separator stays in the buffer for the next call.
//
// force: when truthy, the rest of the buffer is consumed as a final piece
//        even though no further "<" follows.
//
//The loop stops early when a callback pauses the parser (this._running === false).
Parser.prototype._parseTags = function(force){
	var current = 0, //start of the piece that has not been consumed yet
	    opening = this._buffer.indexOf("<"),
	    closing = this._buffer.indexOf(">"),
	    next, rawData, elementData, lastTagSep;

	//if force is true, parse everything: pretend there is a "<" behind the last character
	if(force) opening = Infinity;

	//opening !== closing is just false if both are -1
	while(opening !== closing && this._running){
		lastTagSep = this._tagSep;

		//pick whichever separator comes first and look up its successor
		if((opening !== -1 && opening < closing) || closing === -1){
			next = opening;
			this._tagSep = "<";
			opening = this._buffer.indexOf("<", next + 1);
		}
		else{
			next = closing;
			this._tagSep = ">";
			closing = this._buffer.indexOf(">", next + 1);
		}
		rawData = this._buffer.substring(current, next); //The next chunk of data to parse

		//set elements for next run
		current = next + 1;

		if(this._contentFlags >= SpecialTags[ElementType.CDATA]){
			//We're inside a CDATA section
			this._writeCDATA(rawData);
		}
		else if(this._contentFlags >= SpecialTags[ElementType.Comment]){
			//We're in a comment tag
			this._writeComment(rawData);
		}
		else if(lastTagSep === "<"){
			//the piece followed a "<", so it is tag-like content
			elementData = rawData.trimLeft();
			if(elementData.charAt(0) === "/"){
				elementData = this._parseTagName(elementData.substr(1));
				if(this._contentFlags !== 0){
					//inside script/style: only the matching closing tag ends the special content
					if(this._contentFlags & TagValues[elementData]){
						//remove the flag
						this._contentFlags ^= TagValues[elementData];
					} else {
						this._writeSpecial(rawData, lastTagSep);
						continue;
					}
				}
				this._processCloseTag(elementData);
			}
			else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep);
			else if(elementData.charAt(0) === "!"){
				//_contentFlags is known to be 0 here (checked by the branch above)
				if(elementData.substr(1, 7) === "[CDATA["){
					this._contentFlags |= SpecialTags[ElementType.CDATA];
					if(this._cbs.oncdatastart) this._cbs.oncdatastart();
					this._writeCDATA(elementData.substr(8));
				}
				else if(elementData.substr(1, 2) === "--"){
					//This tag is a comment
					this._contentFlags |= SpecialTags[ElementType.Comment];
					//slice the trimmed data the "!--" was detected in, so that
					//whitespace in front of the "!" cannot shift the offset
					this._writeComment(elementData.substr(3));
				}
				//TODO: This isn't a processing instruction, needs a new name
				else if(this._cbs.onprocessinginstruction){
					this._cbs.onprocessinginstruction(
						"!" + this._parseTagName(elementData.substr(1)),
						elementData
					);
				}
			}
			else if(elementData.charAt(0) === "?"){
				if(this._cbs.onprocessinginstruction){
					this._cbs.onprocessinginstruction(
						"?" + this._parseTagName(elementData.substr(1)),
						elementData
					);
				}
			}
			else this._processOpenTag(elementData);
		}
		else{
			//the piece followed a ">", so it is text
			if(this._contentFlags !== 0){
				this._writeSpecial(rawData, ">");
			}
			else if(this._cbs.ontext){
				if(this._tagSep === ">") rawData += ">"; //it's the second > in a row
				if(rawData !== "") this._cbs.ontext(rawData);
			}
		}
	}

	//drop everything that was consumed
	this._buffer = this._buffer.substr(current);
};
268
//Emits a piece of CDATA content as text.
//A piece that ends with "]]" and is followed by ">" closes the section:
//the text in front of "]]" is emitted, the CDATA flag is removed and oncdataend is called.
//Any other piece is emitted together with the separator that followed it.
Parser.prototype._writeCDATA = function(data){
	var endsSection = this._tagSep === ">" && data.slice(-2) === "]]";

	if(!endsSection){
		if(this._cbs.ontext){
			this._cbs.ontext(data + this._tagSep);
		}
		return;
	}

	// CDATA ends
	if(data.length !== 2 && this._cbs.ontext){
		this._cbs.ontext(data.slice(0, -2));
	}
	this._contentFlags ^= SpecialTags[ElementType.CDATA];
	if(this._cbs.oncdataend){
		this._cbs.oncdataend();
	}
	this._wroteSpecial = false;
};
281
//Emits a piece of comment content through oncomment.
//A piece that ends with "--" and is followed by ">" closes the comment:
//the comment flag is removed, the text in front of "--" is emitted and oncommentend is called.
//Any other piece is emitted together with the separator that followed it.
Parser.prototype._writeComment = function(rawData){
	var endsComment = this._tagSep === ">" && rawData.slice(-2) === "--";

	if(!endsComment){
		if(this._cbs.oncomment){
			this._cbs.oncomment(rawData + this._tagSep);
		}
		return;
	}

	//comment ends: remove the comment flag and the written flag
	this._contentFlags ^= SpecialTags[ElementType.Comment];
	this._wroteSpecial = false;
	if(this._cbs.oncomment){
		this._cbs.oncomment(rawData.slice(0, -2));
	}
	if(this._cbs.oncommentend){
		this._cbs.oncommentend();
	}
};
292
//Emits a piece of content found while _contentFlags is set (script/style content) as text.
//The first piece is emitted as it is (and skipped when empty); every following piece is
//prefixed with the separator that was swallowed in front of it.
Parser.prototype._writeSpecial = function(rawData, lastTagSep){
	if(!this._wroteSpecial){
		//The previous element was not text
		this._wroteSpecial = true;
		if(rawData !== "" && this._cbs.ontext){
			this._cbs.ontext(rawData);
		}
		return;
	}

	//the previous element is text, put the last tag sep in front of this piece
	if(this._cbs.ontext){
		this._cbs.ontext(lastTagSep + rawData);
	}
};
303
//Tags that never have content. Outside of xmlMode they are closed right after
//being opened and explicit closing tags for them are ignored.
//The object has no prototype, so `name in emptyTags` only matches the names listed here.
var emptyTags = {
	__proto__: null,
	area: true, base: true, basefont: true, br: true,
	col: true, frame: true, hr: true, img: true,
	input: true, isindex: true, link: true, meta: true,
	param: true, embed: true
};
321
//Handles a closing tag.
//The innermost open tag with that name is closed together with everything opened
//after it; closing tags without a matching open tag are ignored.
//Outside of xmlMode, closing tags of empty tags are ignored, except for </br>.
Parser.prototype._processCloseTag = function(name){
	var ignoredAsEmptyTag = (name in emptyTags) && !this._options.xmlMode,
	    index, remaining;

	if(this._stack && !ignoredAsEmptyTag){
		index = this._stack.lastIndexOf(name);
		if(index === -1){
			return;
		}

		if(!this._cbs.onclosetag){
			//nobody listens, just drop the tag and everything nested in it
			this._stack.splice(index);
			return;
		}

		//report the closed tags innermost first
		for(remaining = this._stack.length - index; remaining > 0; remaining--){
			this._cbs.onclosetag(this._stack.pop());
		}
		return;
	}

	//many browsers (eg. Safari, Chrome) convert </br> to <br>
	if(name === "br" && !this._options.xmlMode){
		this._processOpenTag(name + "/");
	}
};
337
//Reports every attribute found in the data of an opening tag through the onattribute callback.
// data:    content of the tag (tag name followed by the attributes)
// lcNames: when truthy, attribute names are lower-cased
//Attributes without a value are reported with an empty string.
//The caller has to make sure that an onattribute callback exists.
Parser.prototype._parseAttributes = function(data, lcNames){
	var match, resumeAt;

	//_reAttrib is global and shared: always start at the beginning of `data`,
	//no matter where an earlier, aborted iteration left lastIndex
	_reAttrib.lastIndex = 0;

	while((match = _reAttrib.exec(data)) !== null){
		resumeAt = _reAttrib.lastIndex;
		this._cbs.onattribute(lcNames ? match[1].toLowerCase() : match[1], match[2] || match[3] || match[4] || "");
		//the callback may have used the parser (and thereby the shared expression) itself,
		//continue where this iteration stopped
		_reAttrib.lastIndex = resumeAt;
	}
};
343
//Parses the attribute string of an opening tag.
// data:    content of the tag (tag name followed by the attributes)
// lcNames: when truthy, attribute names are lower-cased
//Returns an object mapping attribute names to their values; attributes without
//a value map to an empty string. If a name occurs more than once, the last value wins.
var parseAttributes = function(data, lcNames){
	var attrs = {},
	    match;

	//_reAttrib is global and shared: always start at the beginning of `data`,
	//no matter where an earlier, aborted iteration left lastIndex
	_reAttrib.lastIndex = 0;

	while((match = _reAttrib.exec(data)) !== null){
		attrs[lcNames ? match[1].toLowerCase() : match[1]] = match[2] || match[3] || match[4] || "";
	}
	return attrs;
};
352
//Handles an opening tag.
// data: content of the tag (tag name followed by the attributes, optionally ending in "/")
//
//Steps:
// 1. outside of xmlMode, close the open tags that the new tag closes implicitly
// 2. notify onopentagname, onopentag (with the attribute map) and onattribute
// 3. either close the tag right away (empty tags outside of xmlMode, or data ending
//    in a "/" that is not part of an attribute) or push it onto the stack; opening
//    script/style outside of xmlMode additionally sets the matching content flag
Parser.prototype._processOpenTag = function(data){
	var hasOwn = Object.prototype.hasOwnProperty,
	    name = this._parseTagName(data),
	    type = ElementType.Tag,
	    impliedCloses, el;

	if(!this._options.xmlMode){
		if(name === "script") type = ElementType.Script;
		else if(name === "style") type = ElementType.Style;

		//own-property checks: a tag named like a member of Object.prototype
		//(e.g. <constructor>) must not be mistaken for an entry of the tables
		if(hasOwn.call(openImpliesClose, name)){
			impliedCloses = openImpliesClose[name];
			while(hasOwn.call(impliedCloses, el = this._stack[this._stack.length - 1])){
				this._processCloseTag(el);
			}
		}
	}

	if(this._cbs.onopentagname) this._cbs.onopentagname(name);
	if(this._cbs.onopentag){
		//the attribute map is only built when somebody asks for it
		this._cbs.onopentag(name, parseAttributes(data, this._options.lowerCaseAttributeNames));
	}
	if(this._cbs.onattribute){
		this._parseAttributes(data, this._options.lowerCaseAttributeNames);
	}

	//If tag self-terminates, add an explicit, separate closing tag.
	//A trailing "/" only counts when it is still there after all attributes were
	//removed, i.e. when it doesn't belong to an (unquoted) attribute value.
	if((!this._options.xmlMode && name in emptyTags) || (data.substr(-1) === "/" && data.replace(_reAttrib, "").substr(-1) === "/")){
		if(this._cbs.onclosetag) this._cbs.onclosetag(name);
	} else {
		if(type !== ElementType.Tag){
			this._contentFlags |= SpecialTags[type];
			this._wroteSpecial = false;
		}
		this._stack.push(name);
	}
};
390
//Reports an error.
// error: the error message
//The message is wrapped in an Error, which is passed to the onerror callback;
//when no such callback exists, the Error is thrown instead.
Parser.prototype._handleError = function(error){
	var wrapped = new Error(error);

	if(!this._cbs.onerror){
		throw wrapped;
	}
	this._cbs.onerror(wrapped);
};
396
397module.exports = Parser;