UNPKG

3.33 kBJavaScriptView Raw
1import {Transform, ingest} from 'vega-dataflow';
2import {inherits} from 'vega-util';
3
/**
 * Transform operator that counts how often each regexp-matched token
 * occurs in a text field of the input tuples.
 * @constructor
 * @param {object} params - The parameters for this operator.
 * @param {function(object): *} params.field - Accessor returning the text to scan.
 * @param {string} [params.pattern] - RegExp source string describing a single token.
 * @param {string} [params.case] - 'lower' or 'upper' to fold case before matching;
 *   any other value (e.g. 'mixed' or null) leaves the text unchanged.
 * @param {string} [params.stopwords] - RegExp source string of tokens to ignore.
 * @param {Array<string>} [params.as] - Output field names for the token text and
 *   its count; ['text', 'count'] is used when omitted.
 */
export default function CountPattern(params) {
  // This operator carries no initial value of its own.
  const initialValue = null;
  Transform.call(this, initialValue, params);
}
16
// Declarative description of the transform: its type name, metadata flags
// and the parameters it accepts.
CountPattern.Definition = {
  'type': 'CountPattern',
  'metadata': {'generates': true, 'changes': true},
  'params': [
    { 'name': 'field', 'type': 'field', 'required': true },
    { 'name': 'case', 'type': 'enum', 'values': ['upper', 'lower', 'mixed'], 'default': 'mixed' },
    // Keep this default identical to the fallback pattern used when the
    // match RegExp is built at runtime: word characters and apostrophes.
    { 'name': 'pattern', 'type': 'string', 'default': '[\\w\']+' },
    { 'name': 'stopwords', 'type': 'string', 'default': '' },
    { 'name': 'as', 'type': 'string', 'array': true, 'length': 2, 'default': ['text', 'count'] }
  ]
};
28
/**
 * Extract tokens from a text value using a regular expression.
 * @param {*} text - The value to tokenize. null and undefined produce no
 *   tokens; any other non-string value is converted with String() first.
 * @param {string} tcase - 'upper' or 'lower' to fold the text to that case
 *   before matching; any other value leaves the text as is.
 * @param {RegExp} match - The pattern handed to String.prototype.match.
 * @return {Array<string>|null} The result of String.prototype.match, or null
 *   when nothing matches or when text is null/undefined.
 */
function tokenize(text, tcase, match) {
  // A missing field value has no tokens; returning null mirrors what
  // String.prototype.match returns when there is no match.
  if (text == null) return null;

  let str = typeof text === 'string' ? text : String(text);
  switch (tcase) {
    case 'upper': str = str.toUpperCase(); break;
    case 'lower': str = str.toLowerCase(); break;
  }
  return str.match(match);
}
36
inherits(CountPattern, Transform, {
  /**
   * Update the per-token counts from the pulse and emit the resulting
   * add/rem/mod output tuples.
   * @param {object} _ - The operator parameters (field, case, pattern,
   *   stopwords, as).
   * @param {object} pulse - The input pulse.
   * @return {object} The forked output pulse produced by _finish.
   */
  transform(_, pulse) {
    const init = this._parameterCheck(_, pulse);
    const counts = this._counts;
    const match = this._match;
    const stop = this._stop;
    const get = _.field;
    const as = _.as || ['text', 'count'];

    // Build a tuple visitor that tokenizes the field value and applies
    // `update` to every token that is not a stopword.
    const process = update => tuple => {
      const tokens = tokenize(get(tuple), _.case, match) || [];
      for (const token of tokens) {
        if (!stop.test(token)) update(token);
      }
    };
    const add = process(t => counts[t] = 1 + (counts[t] || 0));
    const rem = process(t => counts[t] -= 1);

    if (init) {
      // Counts were reset, so recount every source tuple.
      pulse.visit(pulse.SOURCE, add);
    } else {
      pulse.visit(pulse.ADD, add);
      pulse.visit(pulse.REM, rem);
    }

    return this._finish(pulse, as); // generate output tuples
  },

  /**
   * (Re)build the cached regular expressions when their parameters change and
   * decide whether all counts must be recomputed from scratch.
   * @param {object} _ - The operator parameters.
   * @param {object} pulse - The input pulse.
   * @return {boolean} True if the counts were reset and must be recomputed.
   */
  _parameterCheck(_, pulse) {
    let init = false;

    if (_.modified('stopwords') || !this._stop) {
      // Wrap the user pattern in a non-capturing group so the ^ and $ anchors
      // apply to the whole expression, even for a bare alternation like
      // "a|the" (otherwise only the first/last alternative is anchored).
      this._stop = new RegExp('^(?:' + (_.stopwords || '') + ')$', 'i');
      init = true;
    }

    if (_.modified('pattern') || !this._match) {
      this._match = new RegExp((_.pattern || '[\\w\']+'), 'g');
      init = true;
    }

    if (_.modified('field') || pulse.modified(_.field.fields)) {
      init = true;
    }

    if (init) {
      // Null-prototype dictionary: tokens such as "constructor" or "toString"
      // must not resolve to members inherited from Object.prototype.
      const counts = this._counts = Object.create(null);

      // Seed a zero count for every token that still has a live output tuple,
      // so _finish retracts that tuple if the recount no longer finds it.
      const tuples = this._tuples;
      if (tuples) {
        for (const w in tuples) {
          if (tuples[w]) counts[w] = 0;
        }
      }
    }
    return init;
  },

  /**
   * Reconcile the output tuples with the current counts.
   * A token with a non-zero count and no tuple gets a new tuple (add); a token
   * whose count is zero has its tuple retracted (rem); a token whose count
   * differs from its tuple's value has the tuple updated (mod).
   * @param {object} pulse - The input pulse to fork the output from.
   * @param {Array<string>} as - Output field names: [text, count].
   * @return {object} The forked pulse, marked as modifying the `as` fields.
   */
  _finish(pulse, as) {
    const counts = this._counts;
    // Null-prototype for the same reason as the counts dictionary.
    const tuples = this._tuples || (this._tuples = Object.create(null));
    const text = as[0];
    const count = as[1];
    const out = pulse.fork(pulse.NO_SOURCE | pulse.NO_FIELDS);

    for (const w in counts) {
      let t = tuples[w];
      const c = counts[w] || 0;
      if (!t && c) {
        tuples[w] = (t = ingest({}));
        t[text] = w;
        t[count] = c;
        out.add.push(t);
      } else if (c === 0) {
        if (t) out.rem.push(t);
        // Null the entries instead of deleting the keys.
        counts[w] = null;
        tuples[w] = null;
      } else if (t[count] !== c) {
        t[count] = c;
        out.mod.push(t);
      }
    }

    return out.modifies(as);
  }
});