UNPKG

11.7 kBJavaScriptView Raw
1#!/usr/bin/env node
2'use strict';
3var fs = require('fs');
4var path = require('path');
5var process = require('process');
6var domino = require('../../');
7
8/** Rebuild test/html5lib-tests.json based on the test specifications in
9 * the html5lib-tests submodule.
10 */
11
// Namespace URIs, keyed by the short prefix that the html5lib tree dumps
// put in front of a namespaced element or attribute name.
var NAMESPACE = {};
NAMESPACE.html = 'http://www.w3.org/1999/xhtml';
NAMESPACE.xml = 'http://www.w3.org/XML/1998/namespace';
NAMESPACE.xmlns = 'http://www.w3.org/2000/xmlns/';
NAMESPACE.math = 'http://www.w3.org/1998/Math/MathML';
NAMESPACE.svg = 'http://www.w3.org/2000/svg';
NAMESPACE.xlink = 'http://www.w3.org/1999/xlink';
// menuitem is no longer EMPTY, see https://github.com/whatwg/html/pull/907
// This list comes from https://html.spec.whatwg.org/multipage/syntax.html#serialising-html-fragments
//
// Set of HTML element names that are serialized without an end tag.
// The table has no prototype so that a lookup such as EMPTY['constructor']
// or EMPTY['__proto__'] yields undefined instead of an inherited
// Object.prototype member (which would be truthy).
var EMPTY = Object.create(null);
[
  'area',
  'base',
  'basefont',
  'bgsound',
  'br',
  'col',
  'embed',
  'frame',
  'hr',
  'img',
  'input',
  'keygen',
  'link',
  'meta',
  'param',
  'source',
  'track',
  'wbr'
].forEach(function(tag) {
  EMPTY[tag] = true;
});
// Set of HTML element names whose serialization gets an extra leading
// newline when their first text child starts with one.  It is currently
// empty: the former entries
//   pre, textarea, listing
// were removed in https://github.com/whatwg/html/issues/944
//
// The table has no prototype so that lookups by arbitrary tag name (for
// example 'constructor') cannot hit an inherited Object.prototype member.
var EXTRA_NL = Object.create(null);
// Set of HTML element names whose text children are serialized verbatim,
// without entity escaping.
// The table has no prototype so that lookups by arbitrary tag name (for
// example 'constructor') cannot hit an inherited Object.prototype member.
var NO_ESCAPE = Object.create(null);
[
  'style', 'script', 'xmp', 'iframe', 'noembed',
  'noframes', 'plaintext',
  'noscript' // <- assumes that scripting is enabled.
].forEach(function(tag) {
  NO_ESCAPE[tag] = true;
});
54
// Strip a leading "svg ", "math ", "xlink ", "xml " or "xmlns " namespace
// prefix from a name taken from a tree dump; names without such a prefix
// are returned unchanged.
var localname = function(namestring) {
  var prefix = /^(svg|math|xlink|xml|xmlns) /.exec(namestring);
  if (!prefix) {
    return namestring;
  }
  return namestring.slice(prefix[0].length);
};
// Map the namespace prefix of a tree-dump name ("svg ", "math ", "xlink ",
// "xml " or "xmlns ") to its namespace URI.
// Names without one of those prefixes yield `undefined`, which stands in
// for the HTML namespace (NAMESPACE.html) to save space in the output.
var namespace = function(namestring) {
  var prefix = /^(svg|math|xlink|xml|xmlns) /.exec(namestring);
  if (!prefix) {
    return undefined;
  }
  return NAMESPACE[prefix[1]];
};
63
/**
 * Error thrown when a test specification cannot be parsed.
 *
 * The message has the form `<desc> [<filename>]: <input as JSON>`.
 *
 * @param {string} desc      Short description of the problem.
 * @param {string} filename  Name of the test file being processed.
 * @param {*}      input     Offending input; embedded via JSON.stringify.
 */
var ParseError = function ParseError(desc, filename, input) {
  // Note: calling `Error.call(this)` would not initialize `this` (it just
  // creates and returns a separate Error), so the fields are set by hand.
  this.name = this.constructor.name;
  this.message = desc + ' ['+filename+']: ' + JSON.stringify(input);
  // Record a stack trace on the instance.  This is done after `name` and
  // `message` are set because the first line of the trace is built from
  // them.  Passing the constructor hides its own frame from the trace.
  if (typeof Error.captureStackTrace === 'function') {
    Error.captureStackTrace(this, this.constructor);
  }
};
ParseError.prototype = Object.create(Error.prototype);
ParseError.prototype.constructor = ParseError;
71
72
// Return the sorted, normalized paths of every `.dat` file found in the
// `html5lib-tests/tree-construction` directory that sits one level above
// this script's directory.
var list_tests = function() {
  var base = path.join(__dirname, '..', 'html5lib-tests', 'tree-construction');
  var found = [];
  fs.readdirSync(base).forEach(function(entry) {
    if (/\.dat$/.test(entry)) {
      found.push(path.normalize(path.join(base, entry)));
    }
  });
  found.sort();
  return found;
};
81
// Read one `.dat` file and return the list of parsed test cases in it.
// The file is split wherever a blank line is followed by a `#data` header;
// each piece is parsed with parse_one_test and then adjusted by
// twiddle_test.  Both receive the file's base name without the `.dat`
// extension.
var parse_test_file = function(filename) {
  var basename = path.basename(filename, '.dat');
  var contents = fs.readFileSync(filename, 'utf8');
  // Drop a single trailing newline before splitting.
  var chunks = contents.replace(/\n$/,'').split(/\n\n(?=#data\n)/g);
  var parsed = [];
  for (var i = 0; i < chunks.length; i++) {
    parsed.push(twiddle_test(basename, parse_one_test(basename, chunks[i])));
  }
  return parsed;
};
90
91var parse_one_test = function(filename, testcase) {
92 var m = /^#data\n(?:([^]*?)\n)?(?:#script-(on|off)\n)?#errors\n((?:[^\n]*\n)*?)(?:#document-fragment\n([^\n]*)\n)?(?:#script-(on|off)\n)?#document\n([^]*?)$/.exec(testcase+'\n');
93 if (!m) {
94 throw new ParseError("Can't parse test case", filename, testcase);
95 }
96 // According to the README, there should always be at least two newlines
97 // between #data and #errors, but some test cases have only one.
98 // `data` will be null in that case.
99 var fragment = m[4] ? { name: localname(m[4]), ns:namespace(m[4]) } :
100 undefined;
101 return {
102 //file: filename,
103 data: m[1] || '',
104 errors: m[3].split(/\n/g).slice(0,-1),
105 fragment: fragment,
106 script: m[2] || m[5],
107 document: serialize_doc(filename, fragment, m[6])
108 };
109};
110
// Parse the node tree spec, emitting a serialized output string as well
// as a JSON representation of the tree.
//
// Parameters:
//   filename - test file name, used only in ParseError messages.
//   fragment - optional { name, ns } of the fragment context element; when
//              given it is recorded as the tag/ns of the implicit root.
//   doc      - the tree dump: one node per line, each line starting with
//              "| " followed by two spaces per nesting level.
//
// Returns { props, tree, html } where `props` holds flags describing
// features seen in the tree (plus `tags`, the set of tag names used),
// `tree` is the list of top-level nodes and `html` is the serialization.
//
// Throws ParseError for an unrecognized line, bad indentation, an
// attribute that does not directly follow its element, or a void element
// that has children.
var serialize_doc = function(filename, fragment, doc) {
  var result = "", stack = [], can_add_attr = false, props = {tags:{}};
  var root = { children: [] }, parent, obj;
  if (fragment) { root.tag = fragment.name; root.ns = fragment.ns; }
  // `namespace` yields undefined for un-prefixed names; that value marks
  // the HTML namespace throughout this function.
  var html_ns = namespace('html');
  // Membership test for the EMPTY / NO_ESCAPE / EXTRA_NL tables.  Only own
  // properties count, so tag names such as 'constructor' or '__proto__'
  // are not mistaken for table entries via Object.prototype.
  var in_table = function(table, key) {
    return Object.prototype.hasOwnProperty.call(table, key) &&
      Boolean(table[key]);
  };
  // Close a pending start tag once no more attributes can follow it.
  var clear_add_attr = function() {
    if (can_add_attr) {
      result += '>';
      can_add_attr = false;
    }
  };
  // Finish the innermost open node: emit its end tag (unless it is a void
  // element or a template `content` pseudo-node) and return it.
  var pop_stack = function() {
    clear_add_attr();
    var old = stack.pop();
    if (old.content !== true) {
      if (old.ns === html_ns && in_table(EMPTY, old.tag)) {
        if (old.children.length > 0) {
          throw new ParseError("Empty elements ("+old.tag+") can't have children",
                               filename, doc);
        }
      } else {
        result += '</' + old.tag + '>';
      }
    }
    // save some space in the JSON output by omitting empty lists
    if (old.children.length===0) { old.children = undefined; }
    if (old.attrs && old.attrs.length===0) { old.attrs = undefined; }
    return old;
  };
  // Innermost open node, or the implicit root when nothing is open.
  var stack_top = function() {
    if (stack.length === 0) { return root; }
    return stack[stack.length-1];
  };
  // Escape text content.
  var escape = function(s) {
    return s.replace(/[&<>\u00A0]/g, function(c) {
      switch(c) {
      case '&': return '&amp;';
      case '<': return '&lt;';
      case '>': return '&gt;';
      case '\u00A0': return '&nbsp;';
      }
    });
  };
  // Escape an attribute value.
  var escapeAttr = function(s) {
    return s.replace(/[&"\u00A0]/g, function(c) {
      switch(c) {
      case '&': return '&amp;';
      case '"': return '&quot;';
      case '\u00A0': return '&nbsp;';
      }
    });
  };

  while (doc.length > 0) {
    // Capture groups: 1 indentation, 2 tag name, 3 attribute name,
    // 4 attribute value, 5 text, 6 comment, 7 doctype,
    // 8 processing instruction, 9 template `content` marker.
    var m = /^\| ((?: )*)(?:<([^!?>][^>]*)>|([^="\n][^=\n]*)="([^"]*)"|"((?:[^"]|"(?!\n))*)"|<!-- ((?:[^](?!-->))*) -->|<!DOCTYPE ([^>]*)>|<\?([^>]+)>|(content))\n/.exec(doc);
    if (!m) {
      throw new ParseError('Bad document line', filename, doc);
    }
    doc = doc.slice(m[0].length);
    var indent = m[1].length / 2;
    // A shallower (or equal) indent closes the deeper open nodes.
    while (indent < stack.length) {
      pop_stack();
    }
    if (indent !== stack.length) {
      throw new ParseError('Indentation error', filename, doc);
    }
    var tagname = m[2], attrname = m[3], attrvalue = m[4];
    var text = m[5], comment = m[6], doctype = m[7], processing = m[8];
    var template_content = m[9];
    if (attrname !== undefined) {
      if (!can_add_attr)
        throw new ParseError('Late attribute', filename, m);
      obj = {
        name:localname(attrname),
        ns:namespace(attrname),
        value:attrvalue
      };
      if (attrvalue !== escapeAttr(attrvalue)) {
        obj.escaped = props.escaped = true;
      }
      var serializedName;
      if (obj.ns === html_ns) {
        serializedName = obj.name;
      } else if (obj.ns === NAMESPACE.xml) {
        serializedName = 'xml:' + obj.name;
      } else if (obj.ns === NAMESPACE.xmlns) {
        if (obj.name === 'xmlns') {
          serializedName = 'xmlns';
        } else {
          serializedName = 'xmlns:' + obj.name;
        }
      } else if (obj.ns === NAMESPACE.xlink) {
        serializedName = 'xlink:' + obj.name;
      } else {
        throw new Error("don't know what qualified name to use");
      }
      result += ' ' + serializedName + '="' + escapeAttr(obj.value) + '"';
      stack_top().attrs.push(obj);
      if (/[<"]/.test(serializedName)) {
        props.attrWithFunnyChar = true;
      }
      continue;
    }
    clear_add_attr();
    if (tagname !== undefined) {
      result += '<' + localname(tagname);
      can_add_attr = true;
      props.tags[tagname] = true;
      if (/</.test(tagname)) {
        props.tagWithLt = true;
      }
      parent = stack_top();
      stack.push({
        tag: localname(tagname),
        ns: namespace(tagname),
        attrs: [],
        children: []
      });
      parent.children.push(stack_top());
      continue;
    }
    if (text !== undefined) {
      obj = { text: text };
      if (stack_top().ns === html_ns &&
          in_table(NO_ESCAPE, stack_top().tag)) {
        obj.no_escape = props.no_escape = true;
      }
      if (stack_top().ns === html_ns &&
          in_table(EXTRA_NL, stack_top().tag) &&
          stack_top().children.length === 0 &&
          /^\n/.test(text)) {
        result += '\n';
        obj.extraNL = props.extraNL = true;
      }
      if (text !== escape(text) && !obj.no_escape) {
        obj.escaped = props.escaped = true;
      }
      result += obj.no_escape ? text : escape(text);
      stack_top().children.push(obj);
      continue;
    }
    if (comment !== undefined) {
      result += '<!--' + comment + '-->';
      props.comment = true;
      stack_top().children.push({ comment: comment });
      continue;
    }
    if (doctype !== undefined) {
      // HTML serialization spec says just include the name, not the
      // public or system identifiers.
      result += '<!DOCTYPE ' + doctype.replace(/ .*$/, '') + '>';
      props.doctype = true;
      stack_top().children.push({ doctype: doctype });
      continue;
    }
    if (processing !== undefined) {
      result += '<?' + processing + '>';
      props.processing = true;
      stack_top().children.push({ processing: processing });
      continue;
    }
    if (template_content !== undefined) {
      // Pseudo-node for a template's content; it produces no markup of
      // its own and cannot take attributes.
      parent = stack_top();
      stack.push({content:true, children:[]});
      parent.children.push(stack_top());
      can_add_attr = false;
      props.template = true;
      continue;
    }
    throw new ParseError("Unknown line type", filename, m);
  }
  // Close whatever is still open at the end of the dump.
  while (stack.length > 0) {
    pop_stack();
  }
  return {
    props: props,
    tree: root.children,
    html: result
  };
};
292
// Post-process one parsed test case (mutating and returning `tc`):
//  * rewrite the expected HTML serialization for a handful of known tests
//    so that output attribute order always matches input attribute order;
//  * record in `tc.document.noQuirksBodyHtml` what domino produces when
//    the test input is assigned to, and read back from, a fresh
//    document's <body> innerHTML.
var twiddle_test = function(filename, tc) {
  var html = tc.document.html;

  // One entry per known attribute-order tweak.  A rule fires when it
  // applies to this file, its `data` pattern (if any) matches the test
  // input, and its `html` pattern matches the current expected output.
  var rules = [
    {
      applies: /^isindex$/.test(filename),
      data: /<isindex name="A" action="B" prompt="C" foo="D"/,
      html: /<isindex action="B" foo="D" name="A" prompt="C"/,
      find: /<(isindex) (action="B") (foo="D") (name="A") (prompt="C")/,
      swap: '<$1 $4 $2 $5 $3'
    },
    {
      applies: /^tests(9|10)$/.test(filename),
      data: /<(g|mi) xml:lang=en xlink:href=foo/,
      html: /<(g|mi) xlink:href="foo" xml:lang="en"/,
      find: /<(g|mi) (xlink[^> ]+) (xml[^> ]+)/g,
      swap: '<$1 $3 $2'
    },
    {
      applies: filename==='tests19',
      data: /<html c=d>.*<html a=b>/,
      html: /<html a="b" c="d">/,
      find: /a="b" c="d"/,
      swap: 'c="d" a="b"'
    },
    {
      applies: filename==='tests19',
      data: /http-equiv="content-type" content="[^\"]+"/,
      html: /content="[^\"]+" http-equiv="content-type"/,
      find: /(content=[^> ]+) (http-equiv=[^> ]+)/g,
      swap: '$2 $1'
    },
    {
      applies: filename==='tests23',
      data: /size=4 id=a/,
      html: /id="a" size="4"/,
      find: /(id=[^> ]+) (size=[^> ]+)/g,
      swap: '$2 $1'
    },
    {
      applies: filename==='tests26',
      data: null,
      html: /<code code="" x<="">/,
      find: /(code=[^> ]+) (x<=[^> ]+)/g,
      swap: '$2 $1'
    },
    {
      applies: filename==='webkit01',
      data: null,
      html: /<rdar: 6869687="" problem="">/,
      find: /(6869687=[^> ]+) (problem=[^> ]+)/g,
      swap: '$2 $1'
    }
  ];

  // Rules run in order; each one sees the output of the previous ones.
  rules.forEach(function(rule) {
    if (!rule.applies) { return; }
    if (rule.data && !rule.data.test(tc.data)) { return; }
    if (!rule.html.test(html)) { return; }
    html = html.replace(rule.find, rule.swap);
  });
  tc.document.html = html;

  // Will this pass if parsed as a <body> fragment in no-quirks mode?
  // This property is used by some third-party consumers of the parsed
  // tests.
  var scratch = domino.createDocument();
  scratch.body.innerHTML = tc.data;
  tc.document.noQuirksBodyHtml = scratch.body.innerHTML;

  return tc;
};
344
// Parse every test file into a map keyed by the file's base name
// (extension included), then emit the map as pretty-printed JSON: to the
// file named by the first command-line argument when one is given,
// otherwise to standard output.
var result = {};
list_tests().forEach(function(testfile) {
  result[path.basename(testfile)] = parse_test_file(testfile);
});
if (!process.argv[2]) {
  console.log(JSON.stringify(result, null, 2));
} else {
  fs.writeFileSync(process.argv[2], JSON.stringify(result, null, 2), 'utf8');
  console.warn('Wrote', process.argv[2]);
}