1 | // Copyright 2004 Erik Arvidsson. All Rights Reserved.
|
2 | //
|
3 | // This code is triple licensed using Apache Software License 2.0,
|
4 | // Mozilla Public License or GNU Public License
|
5 | //
|
6 | ///////////////////////////////////////////////////////////////////////////////
|
7 | //
|
8 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
9 | // use this file except in compliance with the License. You may obtain a copy
|
10 | // of the License at http://www.apache.org/licenses/LICENSE-2.0
|
11 | //
|
12 | ///////////////////////////////////////////////////////////////////////////////
|
13 | //
|
14 | // The contents of this file are subject to the Mozilla Public License
|
15 | // Version 1.1 (the "License"); you may not use this file except in
|
16 | // compliance with the License. You may obtain a copy of the License at
|
17 | // http://www.mozilla.org/MPL/
|
18 | //
|
19 | // Software distributed under the License is distributed on an "AS IS"
|
20 | // basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
21 | // License for the specific language governing rights and limitations
|
22 | // under the License.
|
23 | //
|
24 | // The Original Code is Simple HTML Parser.
|
25 | //
|
26 | // The Initial Developer of the Original Code is Erik Arvidsson.
|
27 | // Portions created by Erik Arvidssson are Copyright (C) 2004. All Rights
|
28 | // Reserved.
|
29 | //
|
30 | ///////////////////////////////////////////////////////////////////////////////
|
31 | //
|
32 | // This program is free software; you can redistribute it and/or
|
33 | // modify it under the terms of the GNU General Public License
|
34 | // as published by the Free Software Foundation; either version 2
|
35 | // of the License, or (at your option) any later version.
|
36 | //
|
37 | // This program is distributed in the hope that it will be useful,
|
38 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
|
39 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
40 | // GNU General Public License for more details.
|
41 | //
|
42 | // You should have received a copy of the GNU General Public License
|
43 | // along with this program; if not, write to the Free Software
|
44 | // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
45 | //
|
46 | ///////////////////////////////////////////////////////////////////////////////
|
47 |
|
48 | /*
|
49 | var handler ={
|
50 | startElement: function (sTagName, oAttrs) {},
|
51 | endElement: function (sTagName) {},
|
52 | characters: function (s) {},
|
53 | comment: function (s) {}
|
54 | };
|
55 | */
|
56 |
|
/**
 * Minimal SAX-style HTML tokenizer.
 *
 * The parser walks the input string front to back and reports what it finds
 * to a handler object that must provide four callbacks:
 *
 *   startElement(sTagName, aAttrs)  aAttrs is an array of { name, value }
 *   endElement(sTagName)
 *   characters(s)
 *   comment(s)
 *
 * Text that looks like markup but cannot be parsed (a stray '<', an
 * unterminated comment, ...) is reported through characters().
 */
function SimpleHtmlParser() {}

SimpleHtmlParser.prototype = {
  // Not read by the parser itself (it dispatches to `contentHandler`); kept so
  // existing code that touches this property keeps working.
  handler: null,

  // Receiver of the parse events; assigned by parse() when a handler is passed.
  contentHandler: null,

  // regexps

  // Start tag. Group 1 = tag name, group 2 = raw attribute text.
  startTagRe: /^<([^>\s\/]+)((\s+[^=>\s]+(\s*=\s*((\"[^"]*\")|(\'[^']*\')|[^>\s]+))?)*)\s*\/?\s*>/m,
  // End tag. Group 1 = tag name.
  endTagRe: /^<\/([^>\s]+)[^>]*>/m,
  // One attribute. Group 1 = name, 2 = "=value" part, 3 = value as written,
  // 4/5 = double-quoted value with/without quotes, 6/7 = same for single quotes.
  attrRe: /([^=\s]+)(\s*=\s*((\"([^"]*)\")|(\'([^']*)\')|[^>\s]+))?/gm,

  /**
   * Tokenizes `s` and dispatches events to the content handler.
   *
   * @param {string} s         HTML text to parse.
   * @param {Object} [oHandler] Handler to use from now on; when omitted the
   *                            previously assigned `contentHandler` is used.
   */
  parse: function (s, oHandler) {
    if (oHandler) this.contentHandler = oHandler

    var match, index, treatAsChars

    while (s.length > 0) {
      // Anything that is not successfully consumed as markup below is text.
      treatAsChars = true

      // Comment
      if (s.substring(0, 4) === '<!--') {
        index = s.indexOf('-->')
        if (index !== -1) {
          // For "<!-->" / "<!--->" the terminator overlaps the opener
          // (index < 4); clamp so substring() does not swap its arguments
          // and report stray dashes as comment text.
          this.contentHandler.comment(s.substring(4, Math.max(4, index)))
          s = s.substring(index + 3)
          treatAsChars = false
        }
      }
      // end tag
      else if (s.substring(0, 2) === '</') {
        match = this.endTagRe.exec(s)
        // The regexp carries the /m flag, so '^' also matches after a line
        // break; only a match at the very start of `s` is the tag we are on.
        if (match && match.index === 0) {
          this.parseEndTag.apply(this, match)
          s = s.substring(match[0].length)
          treatAsChars = false
        }
      }
      // start tag
      else if (s.charAt(0) === '<') {
        match = this.startTagRe.exec(s)
        if (match && match.index === 0) {
          this.parseStartTag.apply(this, match)
          s = s.substring(match[0].length)
          treatAsChars = false
        }
      }

      if (treatAsChars) {
        // Search from offset 1: if `s` starts with '<' here, that '<' failed
        // to parse as markup and must be consumed as text, otherwise the loop
        // would never make progress.
        index = s.indexOf('<', 1)
        if (index === -1) {
          this.contentHandler.characters(s)
          s = ''
        } else {
          this.contentHandler.characters(s.substring(0, index))
          s = s.substring(index)
        }
      }
    }
  },

  /**
   * Reports a start tag to the handler.
   *
   * @param {string} sTag     Complete tag text (unused here).
   * @param {string} sTagName Tag name.
   * @param {string} sRest    Raw attribute text following the tag name.
   */
  parseStartTag: function (sTag, sTagName, sRest) {
    var attrs = this.parseAttributes(sTagName, sRest)
    this.contentHandler.startElement(sTagName, attrs)
  },

  /**
   * Reports an end tag to the handler.
   *
   * @param {string} sTag     Complete tag text (unused here).
   * @param {string} sTagName Tag name.
   */
  parseEndTag: function (sTag, sTagName) {
    this.contentHandler.endElement(sTagName)
  },

  /**
   * Splits the raw attribute text of a start tag into attribute objects.
   *
   * @param {string} sTagName Name of the tag the attributes belong to.
   * @param {string} s        Raw attribute text.
   * @returns {Array} One { name, value } object per attribute, in source order.
   */
  parseAttributes: function (sTagName, s) {
    var oThis = this
    var attrs = []
    // replace() is used only to visit every match; its result is discarded.
    // All seven capture groups are forwarded: group 7 (a7) holds the content
    // of a single-quoted value and is read by parseAttribute().
    s.replace(this.attrRe, function (a0, a1, a2, a3, a4, a5, a6, a7) {
      attrs.push(
        oThis.parseAttribute(sTagName, a0, a1, a2, a3, a4, a5, a6, a7)
      )
    })
    return attrs
  },

  /**
   * Builds one attribute object from the capture groups of `attrRe`.
   *
   * @param {string} sTagName        Name of the owning tag (unused here).
   * @param {string} sAttribute      Complete attribute text (unused here).
   * @param {string} sName           Attribute name.
   * @param {string} [sAssignment]   The "=value" part, absent for bare names.
   * @param {string} [sRawValue]     Value as written, quotes included.
   * @param {string} [sDoubleQuoted] Double-quoted value including quotes.
   * @param {string} [sDoubleValue]  Content of the double-quoted value.
   * @param {string} [sSingleQuoted] Single-quoted value including quotes.
   * @param {string} [sSingleValue]  Content of the single-quoted value.
   * @returns {{name: string, value: ?string}} `value` is null when the
   *   attribute has no "=value" part.
   */
  parseAttribute: function (
    sTagName,
    sAttribute,
    sName,
    sAssignment,
    sRawValue,
    sDoubleQuoted,
    sDoubleValue,
    sSingleQuoted,
    sSingleValue
  ) {
    var value = ''
    if (sSingleQuoted) {
      // Callers that forward only the first seven groups leave sSingleValue
      // undefined; recover the content by stripping the surrounding quotes.
      value = sSingleValue != null ? sSingleValue : sSingleQuoted.slice(1, -1)
    } else if (sDoubleQuoted) {
      value = sDoubleValue
    } else if (sAssignment) {
      value = sRawValue
    }

    // An empty string is a real value (e.g. a=""); only a missing "=value"
    // part yields null.
    var isEmpty = !value && !sAssignment
    return { name: sName, value: isEmpty ? null : value }
  },
}
|
166 |
|
// CommonJS export: requiring this module yields the SimpleHtmlParser constructor.
// (ES-module equivalent: `export default SimpleHtmlParser`.)
module.exports = SimpleHtmlParser
|