1 | import {Transform, ingest} from 'vega-dataflow';
|
2 | import {inherits} from 'vega-util';
|
3 |
|
4 |
|
5 |
|
6 |
|
7 |
|
8 |
|
9 |
|
10 |
|
11 |
|
12 |
|
/**
 * Transform that counts pattern occurrences in a text field.
 *
 * The constructor only forwards to the Transform base constructor with a
 * null initial value; all work happens in the prototype methods.
 *
 * @constructor
 * @param {object} params - The parameters for this operator.
 * @param {function(object): *} params.field - Accessor for the text field.
 * @param {string} [params.pattern] - RegExp source defining a token.
 * @param {string} [params.case] - 'upper', 'lower' or 'mixed'.
 * @param {string} [params.stopwords] - RegExp source of tokens to ignore.
 * @param {Array<string>} [params.as] - Output field names: [text, count].
 */
export default function CountPattern(params) {
  Transform.call(this, null, params);
}
|
16 |
|
// Declarative description of the transform and its parameters.
// The declared defaults are kept in step with the fallbacks the transform
// applies at run time; in particular the `pattern` default matches word
// characters and apostrophes (not double quotes), exactly like the fallback
// used when the RegExp is compiled.
CountPattern.Definition = {
  'type': 'CountPattern',
  'metadata': {'generates': true, 'changes': true},
  'params': [
    { 'name': 'field', 'type': 'field', 'required': true },
    { 'name': 'case', 'type': 'enum', 'values': ['upper', 'lower', 'mixed'], 'default': 'mixed' },
    { 'name': 'pattern', 'type': 'string', 'default': '[\\w\']+' },
    { 'name': 'stopwords', 'type': 'string', 'default': '' },
    { 'name': 'as', 'type': 'string', 'array': true, 'length': 2, 'default': ['text', 'count'] }
  ]
};
|
28 |
|
/**
 * Split a value into tokens using a regular expression.
 *
 * @param {*} text - The value to tokenize. null/undefined are treated as the
 *   empty string; any other non-string value is coerced with String().
 * @param {string} tcase - 'upper' or 'lower' to normalise case before
 *   matching; any other value leaves the text untouched.
 * @param {RegExp} match - The token pattern handed to String.prototype.match.
 * @return {Array<string>|null} The result of String.prototype.match: the
 *   matches, or null when nothing matched.
 */
function tokenize(text, tcase, match) {
  // Field accessors can yield null, undefined or non-string values (numbers,
  // for instance). Calling string methods on those directly would throw.
  let str = text == null ? '' : String(text);

  switch (tcase) {
    case 'upper': str = str.toUpperCase(); break;
    case 'lower': str = str.toLowerCase(); break;
  }

  return str.match(match);
}
|
36 |
|
inherits(CountPattern, Transform, {
  /**
   * Update the per-token counts from the incoming pulse and emit the
   * derived (text, count) tuples.
   *
   * @param {object} _ - Operator parameters (field, case, pattern,
   *   stopwords, as) exposing a modified(name) check.
   * @param {object} pulse - The input pulse.
   * @return {object} The forked output pulse produced by _finish.
   */
  transform(_, pulse) {
    // Must run first: it (re)builds this._stop, this._match and this._counts.
    const init = this._parameterCheck(_, pulse);
    const counts = this._counts;
    const match = this._match;
    const stop = this._stop;
    const get = _.field;
    const as = _.as || ['text', 'count'];

    // Build a tuple visitor that tokenizes the field value and applies
    // `update` to every token that is not a stopword.
    const process = update => tuple => {
      const tokens = tokenize(get(tuple), _.case, match) || [];
      for (const token of tokens) {
        if (!stop.test(token)) update(token);
      }
    };

    const add = process(t => { counts[t] = 1 + (counts[t] || 0); });
    const rem = process(t => { counts[t] -= 1; });

    if (init) {
      // Counts were reset, so recount every source tuple.
      pulse.visit(pulse.SOURCE, add);
    } else {
      pulse.visit(pulse.ADD, add);
      pulse.visit(pulse.REM, rem);
    }

    return this._finish(pulse, as);
  },

  /**
   * Recompile the cached regular expressions when their parameters change
   * and decide whether the counts have to be rebuilt from scratch.
   *
   * @param {object} _ - Operator parameters.
   * @param {object} pulse - The input pulse.
   * @return {boolean} true when this._counts was reset and a full recount
   *   of the source tuples is required.
   */
  _parameterCheck(_, pulse) {
    let init = false;

    if (_.modified('stopwords') || !this._stop) {
      // NOTE: the stopwords source is anchored as given, so a top-level
      // alternation has to be parenthesised by the caller ("(a|b)").
      this._stop = new RegExp('^' + (_.stopwords || '') + '$', 'i');
      init = true;
    }

    if (_.modified('pattern') || !this._match) {
      this._match = new RegExp((_.pattern || '[\\w\']+'), 'g');
      init = true;
    }

    if (_.modified('field') || pulse.modified(_.field.fields)) {
      init = true;
    }

    // Null-prototype dictionary: tokens such as "constructor" or "toString"
    // must not resolve to members inherited from Object.prototype.
    if (init) this._counts = Object.create(null);
    return init;
  },

  /**
   * Reconcile the output tuples with the current counts and report the
   * differences as adds, removes and modifications on a forked pulse.
   *
   * The text field is written once, when a tuple is created; afterwards
   * only the count field is updated.
   *
   * @param {object} pulse - The input pulse to fork.
   * @param {Array<string>} as - Output field names: [text, count].
   * @return {object} The forked pulse, flagged as modifying `as`.
   */
  _finish(pulse, as) {
    const counts = this._counts,
          tuples = this._tuples || (this._tuples = Object.create(null)),
          text = as[0],
          count = as[1],
          out = pulse.fork(pulse.NO_SOURCE | pulse.NO_FIELDS);
    let w, t, c;

    for (w in counts) {
      t = tuples[w];
      c = counts[w] || 0;
      if (!t && c) {
        // first occurrence of this token: create its output tuple
        tuples[w] = (t = ingest({}));
        t[text] = w;
        t[count] = c;
        out.add.push(t);
      } else if (c === 0) {
        // token no longer occurs: retract its tuple, if one was emitted
        if (t) out.rem.push(t);
        counts[w] = null;
        tuples[w] = null;
      } else if (t[count] !== c) {
        t[count] = c;
        out.mod.push(t);
      }
    }

    // After a reset of the counts, tokens that no longer occur are absent
    // from `counts` altogether, so the loop above never visits them.
    // Retract their tuples here; otherwise they would linger downstream.
    for (w in tuples) {
      t = tuples[w];
      if (t && !(w in counts)) {
        out.rem.push(t);
        delete tuples[w];
      }
    }

    return out.modifies(as);
  }
});
|