| 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147 |
1×
1×
1×
1×
1×
1×
3744×
3744×
3570×
3744×
1×
174×
174×
174×
792×
792×
792×
792×
792×
174×
174×
174×
1×
174×
174×
174×
174×
174×
174×
174×
1×
174×
174×
174×
174×
174×
1×
1×
| /**
* @file parse code
* @author nighca<nighca@live.cn>
*/
var htmlparser2 = require('htmlparser2');
var Parser = htmlparser2.Parser;
var DomHandler = htmlparser2.DomHandler;
var util = require('./util');
var Node = require('./node');
/**
 * Initialize a node through Node.init, then do the same for each of its
 * descendants. A node is always initialized before its own children are
 * visited.
 *
 * @param {Object} node - node to initialize
 * @return {Node} the very same node object that was passed in
 */
var transformRecursively = function (node) {
    // visitor for forEach: forwards the child only, drops index / array args
    var visitChild = function (child) {
        transformRecursively(child);
    };

    // init first: childNodes is read only after Node.init has seen the node
    Node.init(node);
    node.childNodes.forEach(visitChild);

    return node;
};
/**
 * Wrap node list with <document>.
 *
 * A fresh <document> element is parsed, the given nodes are installed as its
 * children, and each of them gets its sibling links rebuilt, `root` pointed
 * at the document and `parent` reset to null. The document is then passed
 * through transformRecursively.
 *
 * @param {Array} arr - node list (top-level nodes, mutated in place)
 * @return {Node} document node, with a numeric startIndex
 */
var wrapDocument = function (arr) {
    var document = htmlparser2.parseDOM('<document></document>')[0];
    document.children = arr;

    for (var i = 0; i < arr.length; i++) {
        var node = arr[i];
        // out-of-range neighbours (before first / after last) become null
        node.prev = arr[i - 1] || null;
        node.next = arr[i + 1] || null;
        node.root = document;
        node.parent = null;
    }

    transformRecursively(document);

    // fix startIndex missing, cause <document> is parsed separately.
    // `|` binds tighter than `&&`, so the `| 0` coercion must wrap the whole
    // lookup; otherwise a missing documentElement would leak null / undefined
    // into startIndex instead of falling back to 0.
    var documentElement = document.documentElement;
    document.startIndex = (documentElement && documentElement.startIndex) | 0;

    return document;
};
/**
 * Create an HTML parser backed by a DomHandler.
 *
 * The returned parser exposes its handler as `parser.handler` and its
 * tokenizer (passed through util.emittable) as `parser.tokenizer`; the parser
 * itself is passed through util.emittable as well.
 *
 * @param {Object} options - options for create parser, merged with defaults
 * @return {Parser} HTML parser
 */
var getParser = function (options) {
    // tokenizer method names handed to util.emittable
    var tokenizerEvents = [
        'attribdata',
        'opentagname',
        'opentagend',
        'selfclosingtag',
        'attribname',
        'attribend',
        'closetag',
        'declaration',
        'processinginstruction',
        'comment',
        'cdata',
        'text',
        'error',
        'end'
    ];

    // parser method names handed to util.emittable
    var parserEvents = [
        'processinginstruction',
        'comment',
        'commentend',
        'cdatastart',
        'text',
        'cdataend',
        'error',
        'closetag',
        'end',
        'reset',
        'parserinit',
        'opentagname',
        'opentag',
        'attribute'
    ];

    // defaults first, caller options merged on top of them
    var defaults = {
        lowerCaseAttributeNames: false,
        recognizeCDATA: true
    };
    var settings = util.extend(defaults, options);

    // handler records start indices of the nodes it builds
    var handler = new DomHandler({
        withStartIndices: true
    });
    var rawParser = new Parser(handler, settings);

    // expose handler and (emittable) tokenizer on the parser
    rawParser.handler = handler;
    rawParser.tokenizer = util.emittable(rawParser._tokenizer, tokenizerEvents);

    // the parser handed back is whatever util.emittable returns for it
    return util.emittable(rawParser, parserEvents);
};
/**
 * Parse HTML source into a document node.
 *
 * Windows line endings ("\r\n") are normalized to "\n" before the content is
 * fed to the parser; the resulting dom is then wrapped with <document>.
 *
 * @param {string} htmlCode - HTML code content
 * @param {Parser=} parser - parser to use; one is created via getParser when omitted
 * @return {Node} document node
 */
var parse = function (htmlCode, parser) {
    // fall back to a default parser when none is supplied
    var activeParser = parser || getParser();

    // replace "\r\n" with "\n"
    var normalizedCode = htmlCode.replace(/\r\n/g, '\n');

    // feed the whole content in one go
    activeParser.end(normalizedCode);

    // the handler holds the parsed dom; wrap it with <document>
    return wrapDocument(activeParser.handler.dom);
};
// expose the parser factory on the exported function, so a parser can be
// created separately and handed to parse() as its second argument
parse.getParser = getParser;
// the module's export is the parse function itself
module.exports = parse;
|