1 | 'use strict'
|
2 |
|
3 |
|
4 |
|
5 | exports = module.exports = Pickup
|
6 |
|
7 | const StringDecoder = require('string_decoder').StringDecoder
|
8 | const attribute = require('./lib/attribute')
|
9 | const debug = require('util').debuglog('pickup')
|
10 | const mappings = require('./lib/mappings')
|
11 | const os = require('os')
|
12 | const sax = require('sax')
|
13 | const stream = require('readable-stream')
|
14 | const util = require('util')
|
15 |
|
/**
 * Table that maps opening tag names to the handler methods found on `t`.
 * `channel` and `feed` share `t.feedopen`; `item` and `entry` share
 * `t.entryopen`; `image` uses `t.imageopen`.
 *
 * @param {Object} t Object providing `feedopen`, `entryopen`, `imageopen`.
 */
function OpenHandlers (t) {
  const { feedopen, entryopen, imageopen } = t

  this.channel = feedopen
  this.feed = feedopen
  this.item = entryopen
  this.entry = entryopen
  this.image = imageopen
}
|
23 |
|
/**
 * Table that maps closing tag names to the handler methods found on `t`.
 * `channel` and `feed` share `t.feedclose`; `item` and `entry` share
 * `t.entryclose`; `image` uses `t.imageclose`.
 *
 * @param {Object} t Object providing `feedclose`, `entryclose`, `imageclose`.
 */
function CloseHandlers (t) {
  const onFeed = t.feedclose
  const onEntry = t.entryclose
  const onImage = t.imageclose

  this.channel = onFeed
  this.feed = onFeed
  this.item = onEntry
  this.entry = onEntry
  this.image = onImage
}
|
31 |
|
/**
 * Plain options record with the three fields `trim`, `normalize`, and
 * `position`, stored in that order.
 *
 * @param {*} trim Stored as `this.trim`.
 * @param {*} normalize Stored as `this.normalize`.
 * @param {*} position Stored as `this.position`.
 */
function Opts (trim, normalize, position) {
  Object.assign(this, { trim, normalize, position })
}
|
37 |
|
/**
 * Derives an encoding name from a string by case-insensitive search.
 * A `utf-8` match wins and yields `'utf8'`; otherwise an `iso-8859-1`
 * match yields `'binary'`; anything else falls back to `'utf8'`.
 *
 * @param {string} str Text to search.
 * @returns {string} `'utf8'` or `'binary'`.
 */
function encodingFromString (str) {
  const mentionsUTF8 = str.match(/utf-8/i) !== null
  if (mentionsUTF8) return 'utf8'

  const mentionsLatin1 = str.match(/iso-8859-1/i) !== null
  return mentionsLatin1 ? 'binary' : 'utf8'
}
|
46 |
|
/**
 * Reads `opts.charset` and translates it with `encodingFromString`.
 *
 * @param {Object} [opts] Options object; may be absent.
 * @returns {?string} The encoding, or `null` when `opts` is falsy or
 *   `opts.charset` is not a string.
 */
function encodingFromOpts (opts) {
  const charset = opts ? opts.charset : null
  return typeof charset === 'string' ? encodingFromString(charset) : null
}
|
52 |
|
// Options passed to every sax parser created by Pickup: `trim` and
// `normalize` are on, `position` is off.
const saxOpts = new Opts(true, true, false)

util.inherits(Pickup, stream.Transform)

/**
 * Transform stream that feeds incoming XML through a sax parser and collects
 * feed-level (`channel`/`feed`) and entry-level (`item`/`entry`) properties.
 * May be called with or without `new`.
 *
 * @param {Object} [opts] Forwarded to `stream.Transform`. Additionally read:
 *   `opts.charset` (via `encodingFromOpts`) and `opts.eventMode`; when
 *   `eventMode` is truthy, the close handlers emit `'feed'`/`'entry'` events
 *   instead of pushing data.
 */
function Pickup (opts) {
  if (!(this instanceof Pickup)) return new Pickup(opts)
  stream.Transform.call(this, opts)

  // The handler tables only reference prototype methods, so they are built
  // once, on first construction, and shared by all instances.
  if (!Pickup.openHandlers) {
    Pickup.openHandlers = new OpenHandlers(Pickup.prototype)
    Pickup.closeHandlers = new CloseHandlers(Pickup.prototype)
  }

  this.encoding = encodingFromOpts(opts)
  this.decoder = new StringDecoder(this.encoding)

  this.eventMode = opts && opts.eventMode
  this.map = null
  this.parser = sax.parser(true, saxOpts)
  this.state = new State()

  const parser = this.parser

  // Stores the text of the current element on the current entry or feed,
  // under the key the active map assigns to the element name.
  parser.ontext = (t) => {
    const current = this.current()
    const map = this.map
    const state = this.state
    const name = state.name

    if (!current || !map) return

    let key = map.get(name)
    if (key === undefined) return

    // Inside an <image> element, <url> holds the image URL.
    if (state.image && name === 'url') key = 'image'

    if (current[key] !== undefined) {
      // The first value wins, with one exception: a `content:encoded` text
      // shorter than 4096 characters replaces an existing summary.
      const overrides = key === 'summary' &&
        name === 'content:encoded' &&
        t.length < 4096
      if (!overrides) return

      debug('overriding %s with %s', key, name)
    }

    current[key] = t
  }

  // CDATA sections are treated exactly like text nodes.
  parser.oncdata = (d) => {
    parser.ontext(d)
  }

  // Invokes the handler registered for `name`, if any, with this stream as
  // receiver. Only own properties count, so tag names never reach inherited
  // members of the table.
  const handle = (name, handlers) => {
    if (Object.prototype.hasOwnProperty.call(handlers, name)) {
      handlers[name].apply(this)
    }
  }

  parser.onopentag = (node) => {
    const name = node.name
    this.state.name = name

    // Tag names come from the document. Only install a value that offers the
    // `get` method used below; anything else keeps the previous map.
    // NOTE(review): assumes every valid entry of `mappings` exposes `get`,
    // as the lookups in this constructor require — confirm in ./lib/mappings.
    const mapped = mappings[name]
    if (mapped && typeof mapped.get === 'function') this.map = mapped

    handle(name, Pickup.openHandlers)

    // A feed or entry can be opened before any map is selected (no mapped
    // element seen yet); without a map there is nothing to look up.
    const current = this.current()
    const map = this.map
    if (!current || !map) return

    const key = map.get(name)
    if (!key) return

    const attributes = node.attributes
    if (Object.keys(attributes).length === 0) return

    const kv = attribute(key, attributes, current)
    if (kv) current[kv[0]] = kv[1]
  }

  parser.onclosetag = (name) => {
    handle(name, Pickup.closeHandlers)
    this.state.name = null
  }
}
|
145 |
|
/**
 * Returns the object currently being populated: the open entry if there is
 * one, otherwise the open feed.
 *
 * @returns {*} `state.entry` when truthy, else `state.feed`.
 */
Pickup.prototype.current = function () {
  const state = this.state
  return state.entry || state.feed
}
|
149 |
|
/**
 * Reports the `objectMode` flag of this stream's readable state.
 *
 * @returns {*} The value of `this._readableState.objectMode`.
 */
Pickup.prototype.objectMode = function () {
  const { objectMode } = this._readableState
  return objectMode
}
|
153 |
|
/**
 * Open handler for feed-level tags: starts a fresh `Feed`. If a feed is
 * already open, that is logged through `debug` and the feed is replaced.
 */
Pickup.prototype.feedopen = function () {
  const state = this.state
  const previous = state.feed

  if (previous) {
    debug('nested feed: ', previous)
  }
  state.feed = new Feed()
}
|
159 |
|
/**
 * Open handler for entry-level tags: starts a fresh `Entry`. If an entry is
 * already open, that is logged through `debug` and the entry is replaced.
 */
Pickup.prototype.entryopen = function () {
  const state = this.state
  const previous = state.entry

  if (previous) {
    debug('nested entry: ', previous)
  }
  state.entry = new Entry()
}
|
165 |
|
/**
 * Open handler for `image` tags: flags that the parser is inside an image
 * element.
 */
Pickup.prototype.imageopen = function () {
  const state = this.state
  state.image = true
}
|
169 |
|
/**
 * Close handler for entry-level tags. Does nothing when no entry is open.
 * Otherwise delivers the entry and clears it from the state:
 * - event mode: emits `'entry'` with the entry;
 * - object mode: pushes the entry object;
 * - else: pushes the entry as a JSON string followed by `os.EOL`.
 */
Pickup.prototype.entryclose = function () {
  const entry = this.state.entry
  if (!entry) return

  if (this.eventMode) {
    this.emit('entry', entry)
  } else if (this.objectMode()) {
    this.push(entry)
  } else {
    this.push(`${JSON.stringify(entry)}${os.EOL}`)
  }

  this.state.entry = null
}
|
185 |
|
/**
 * Close handler for feed-level tags. Does nothing when no feed is open.
 * Otherwise delivers the feed and clears it from the state:
 * - event mode: emits `'feed'` with the feed;
 * - object mode: pushes the feed object;
 * - else: pushes the feed as a JSON string followed by `os.EOL`.
 */
Pickup.prototype.feedclose = function () {
  const feed = this.state.feed
  if (!feed) return

  if (this.eventMode) {
    this.emit('feed', feed)
  } else if (this.objectMode()) {
    this.push(feed)
  } else {
    this.push(`${JSON.stringify(feed)}${os.EOL}`)
  }

  this.state.feed = null
}
|
201 |
|
/**
 * Close handler for `image` tags: clears the flag set by `imageopen`.
 */
Pickup.prototype.imageclose = function () {
  const state = this.state
  state.image = false
}
|
205 |
|
/**
 * Detaches the four parser callbacks installed by the Pickup constructor by
 * setting each of them to `null`.
 *
 * @param {Object} parser The parser whose handlers are cleared.
 */
function free (parser) {
  const handlerNames = ['oncdata', 'onclosetag', 'onopentag', 'ontext']
  for (const handlerName of handlerNames) {
    parser[handlerName] = null
  }
}
|
212 |
|
/**
 * Transform `_flush` hook: tears the instance down once all input has been
 * written. The parser's handlers are detached before the parser is closed;
 * then every per-stream reference is dropped and `cb` is invoked.
 *
 * @param {Function} cb Called with no arguments when teardown is complete.
 */
Pickup.prototype._flush = function (cb) {
  const parser = this.parser
  const state = this.state

  free(parser)
  parser.close()

  this._decoder = null
  this.encoding = null
  this.map = null
  this.parser = null

  state.deinit()
  this.state = null

  cb()
}
|
228 |
|
/**
 * Guesses the encoding from the start of an XML document. Looks at the text
 * that follows the first occurrence of `encoding`; if, after trimming, it
 * begins with `=`, that text is handed to `encodingFromString`.
 *
 * @param {string} str Leading portion of the document.
 * @returns {string} The detected encoding, `'utf8'` when none is declared.
 */
function cribEncoding (str) {
  const fallback = 'utf8'
  const rest = str.split('encoding')[1]

  if (!rest || rest.trim()[0] !== '=') return fallback
  return encodingFromString(rest)
}
|
238 |
|
/**
 * Transform `_transform` hook: decodes a chunk and writes it to the parser.
 *
 * On the first chunk the decoder is created. If no encoding was configured,
 * it is guessed from the first 128 bytes of that chunk, so a declaration
 * that does not fit into those bytes is not seen. The `'encoding'` event is
 * emitted exactly once, when the decoder is created.
 *
 * @param {Buffer} chunk Raw input.
 * @param {string} enc Unused.
 * @param {Function} cb Called with the parser's error for this write, if any.
 */
Pickup.prototype._transform = function (chunk, enc, cb) {
  if (!this._decoder) {
    if (!this.encoding) {
      const t = chunk.toString('ascii', 0, 128)
      this.encoding = cribEncoding(t)
    }
    // Create the decoder only now, so that it honours the detected encoding,
    // and remember it, so that detection and the event happen just once.
    this._decoder = new StringDecoder(this.encoding)
    this.emit('encoding', this.encoding)
  }

  const str = this._decoder.write(chunk)
  const er = this.parser.write(str).error
  // Reset the parser's error so that later writes are still processed.
  this.parser.error = null
  cb(er)
}
|
254 |
|
/**
 * Record for a single feed entry. Every property is initialised, in a fixed
 * order, from the positional argument of the same name; omitted arguments
 * leave the property `undefined`.
 *
 * `feed` is not a parameter. It is initialised to `undefined` so the
 * property is present on every entry without capturing any unrelated
 * module-level binding named `feed`.
 *
 * @param {*} [author]
 * @param {*} [duration]
 * @param {*} [enclosure]
 * @param {*} [id]
 * @param {*} [image]
 * @param {*} [link]
 * @param {*} [originalURL]
 * @param {*} [subtitle]
 * @param {*} [summary]
 * @param {*} [title]
 * @param {*} [updated]
 * @param {*} [url]
 */
function Entry (
  author
  , duration
  , enclosure
  , id
  , image
  , link
  , originalURL
  , subtitle
  , summary
  , title
  , updated
  , url) {
  this.author = author
  this.duration = duration
  this.enclosure = enclosure
  this.feed = undefined
  this.id = id
  this.image = image
  this.link = link
  this.originalURL = originalURL
  this.subtitle = subtitle
  this.summary = summary
  this.title = title
  this.updated = updated
  this.url = url
}
|
282 |
|
/**
 * Record for feed-level metadata. Every property is initialised, in the
 * order of the parameter list, from the positional argument of the same
 * name; omitted arguments leave the property `undefined`.
 *
 * @param {*} [author]
 * @param {*} [copyright]
 * @param {*} [id]
 * @param {*} [image]
 * @param {*} [language]
 * @param {*} [link]
 * @param {*} [originalURL]
 * @param {*} [payment]
 * @param {*} [subtitle]
 * @param {*} [summary]
 * @param {*} [title]
 * @param {*} [ttl]
 * @param {*} [updated]
 * @param {*} [url]
 */
function Feed (
  author,
  copyright,
  id,
  image,
  language,
  link,
  originalURL,
  payment,
  subtitle,
  summary,
  title,
  ttl,
  updated,
  url
) {
  Object.assign(this, {
    author,
    copyright,
    id,
    image,
    language,
    link,
    originalURL,
    payment,
    subtitle,
    summary,
    title,
    ttl,
    updated,
    url
  })
}
|
313 |
|
/**
 * Mutable parser state of a Pickup stream.
 *
 * @param {*} [entry] The entry being populated, if any.
 * @param {*} [feed] The feed being populated, if any.
 * @param {*} [image] Whether the parser is inside an image element.
 * @param {*} [name] Name of the current element.
 */
function State (entry, feed, image, name) {
  Object.assign(this, { entry, feed, image, name })
}

/**
 * Resets the state: `entry` and `feed` become `null`, `image` becomes
 * `false`, and `name` becomes `undefined`.
 */
State.prototype.deinit = function () {
  Object.assign(this, {
    entry: null,
    feed: null,
    image: false,
    name: undefined
  })
}
|
327 |
|
/**
 * Copies the own enumerable properties of `add` onto `origin`.
 *
 * @param {Object} origin Object to extend; it is modified in place.
 * @param {Object} [add] Source of properties; a falsy value copies nothing.
 * @returns {Object} `origin`.
 */
function extend (origin, add) {
  const source = add || Object.create(null)
  return Object.assign(origin, source)
}
|
/**
 * Creates a new `Entry` and copies the properties of `obj` onto it.
 *
 * @param {Object} [obj] Property values for the entry.
 * @returns {Entry} The populated entry.
 */
function entry (obj) {
  const blank = new Entry()
  return extend(blank, obj)
}
|
/**
 * Creates a new `Feed` and copies the properties of `obj` onto it.
 *
 * @param {Object} [obj] Property values for the feed.
 * @returns {Feed} The populated feed.
 */
function feed (obj) {
  const blank = new Feed()
  return extend(blank, obj)
}
|
337 |
|
338 | if (process.mainModule.filename.match(/test/) !== null) {
|
339 | exports.cribEncoding = cribEncoding
|
340 | exports.entry = entry
|
341 | exports.feed = feed
|
342 | exports.EVENTS = [
|
343 | 'data',
|
344 | 'drain',
|
345 | 'readable',
|
346 | 'end',
|
347 | 'entry',
|
348 | 'error',
|
349 | 'feed',
|
350 | 'finish'
|
351 | ]
|
352 | }
|