1 | import {Transform, ingest} from 'vega-dataflow';
|
2 | import {inherits} from 'vega-util';
|
3 |
|
4 |
|
5 |
|
6 |
|
7 |
|
8 |
|
9 |
|
10 |
|
11 |
|
12 |
|
/**
 * Transform that counts occurrences of a regular-expression pattern in a
 * text field and emits one output tuple per distinct matched token.
 *
 * @constructor
 * @param {object} params - The transform parameters.
 * @param {function(object): *} params.field - Accessor for the text to scan.
 * @param {string} [params.case] - 'upper' or 'lower' to fold case before
 *   matching; any other value leaves the text as-is.
 * @param {string} [params.pattern] - Regular expression source describing a
 *   token; the transform falls back to [\w']+ when it is not provided.
 * @param {string} [params.stopwords] - Regular expression source describing
 *   tokens to ignore; empty by default.
 * @param {Array<string>} [params.as] - Output field names for the token and
 *   its count; the transform falls back to ['text', 'count'].
 */
export default function CountPattern(params) {
  // Delegate construction to the base class. null fills the argument slot
  // that precedes params (presumably the operator's initial value; confirm
  // against vega-dataflow's Transform).
  Transform.apply(this, [null, params]);
}
|
16 |
|
/**
 * Static description of the transform: its registered type name, metadata
 * flags, and the parameters it accepts along with their declared defaults.
 */
CountPattern.Definition = {
  'type': 'CountPattern',
  'metadata': {'generates': true, 'changes': true},
  'params': [
    { 'name': 'field', 'type': 'field', 'required': true },
    { 'name': 'case', 'type': 'enum', 'values': ['upper', 'lower', 'mixed'], 'default': 'mixed' },
    // Must mirror the fallback pattern compiled by _parameterCheck, which
    // matches runs of word characters and apostrophes (not double quotes).
    { 'name': 'pattern', 'type': 'string', 'default': '[\\w\']+' },
    { 'name': 'stopwords', 'type': 'string', 'default': '' },
    { 'name': 'as', 'type': 'string', 'array': true, 'length': 2, 'default': ['text', 'count'] }
  ]
};
|
28 |
|
/**
 * Optionally folds the case of a text value and extracts every match of a
 * pattern from it.
 *
 * @param {*} text - The value to scan. null and undefined produce no
 *   tokens; any other non-string value is converted with String().
 * @param {string} tcase - 'upper' or 'lower' to fold case before matching;
 *   any other value leaves the text unchanged.
 * @param {RegExp} match - Pattern passed to String.prototype.match.
 * @returns {Array<string>|null} The result of String.prototype.match, or
 *   null when there is no text to scan.
 */
function tokenize(text, tcase, match) {
  // A missing field value has nothing to count. Without this guard the
  // string method calls below would throw a TypeError.
  if (text == null) return null;

  let source = typeof text === 'string' ? text : String(text);

  if (tcase === 'upper') {
    source = source.toUpperCase();
  } else if (tcase === 'lower') {
    source = source.toLowerCase();
  }

  return source.match(match);
}
|
36 |
|
// Link CountPattern to Transform and keep a handle on the object returned by
// inherits(); the methods defined below are attached to it.
const prototype = inherits(CountPattern, Transform);
|
38 |
|
/**
 * Updates the running token counts from the tuples in a pulse and produces
 * the output pulse describing the resulting count tuples.
 *
 * When the parameter check reports a reset, every tuple visited under
 * pulse.SOURCE is counted afresh. Otherwise tuples visited under pulse.ADD
 * increment the counts and tuples visited under pulse.REM decrement them.
 *
 * @param {object} _ - Transform parameters (field, case, as, ...).
 * @param {object} pulse - The input pulse.
 * @returns {*} Whatever this._finish returns for the pulse and field names.
 */
prototype.transform = function(_, pulse) {
  // Must run first: it (re)builds the regexps and count table read below.
  const reset = this._parameterCheck(_, pulse);
  const tally = this._counts;
  const pattern = this._match;
  const stopwords = this._stop;
  const accessor = _.field;
  const names = _.as || ['text', 'count'];

  // Builds a tuple visitor that feeds every non-stopword token to `apply`.
  const visitor = (apply) => (tuple) => {
    const found = tokenize(accessor(tuple), _.case, pattern) || [];
    for (const token of found) {
      if (!stopwords.test(token)) apply(token);
    }
  };

  const increment = visitor((token) => {
    tally[token] = 1 + (tally[token] || 0);
  });
  const decrement = visitor((token) => {
    tally[token] -= 1;
  });

  if (!reset) {
    pulse.visit(pulse.ADD, increment);
    pulse.visit(pulse.REM, decrement);
  } else {
    pulse.visit(pulse.SOURCE, increment);
  }

  return this._finish(pulse, names);
};
|
67 |
|
/**
 * Rebuilds cached state when a parameter that affects counting has changed.
 *
 * Compiles the stop word and match regular expressions on first use or when
 * their parameters are modified. Accumulated counts are discarded if either
 * regexp was rebuilt, the field parameter was modified, or the pulse reports
 * modifications to the fields the accessor reads.
 *
 * @param {object} _ - Transform parameters; must expose modified(name) and
 *   field.fields, and may provide stopwords and pattern.
 * @param {object} pulse - Input pulse; must expose modified(fields).
 * @returns {boolean} true if the counts were reset and must be recomputed
 *   from scratch, false if the existing counts can be updated in place.
 */
prototype._parameterCheck = function(_, pulse) {
  let init = false;

  if (_.modified('stopwords') || !this._stop) {
    // The non-capturing group makes the anchors apply to the whole
    // expression. Without it 'the|and' would compile to /^the|and$/ and
    // also reject tokens that merely start with "the" or end with "and".
    this._stop = new RegExp('^(?:' + (_.stopwords || '') + ')$', 'i');
    init = true;
  }

  if (_.modified('pattern') || !this._match) {
    this._match = new RegExp((_.pattern || '[\\w\']+'), 'g');
    init = true;
  }

  if (_.modified('field') || pulse.modified(_.field.fields)) {
    init = true;
  }

  // Tokens are arbitrary text, so use a prototype-less table: on a plain
  // object, words such as "constructor" or "toString" would read inherited
  // members instead of a missing count.
  if (init) this._counts = Object.create(null);
  return init;
};
|
88 |
|
/**
 * Reconciles the current counts with the previously emitted tuples and
 * records the differences on a forked output pulse.
 *
 * - A token with a non-zero count and no tuple gets a new tuple (add).
 * - A token whose count dropped to zero has its tuple removed (rem).
 * - A token whose count changed has its tuple updated (mod).
 * - A tuple whose token is absent from the counts, which happens after the
 *   counts have been reset, is removed (rem).
 *
 * @param {object} pulse - Input pulse; must expose fork() and the NO_SOURCE
 *   and NO_FIELDS flags.
 * @param {Array<string>} as - Output field names: [token field, count field].
 * @returns {*} The result of calling modifies(as) on the forked pulse.
 */
prototype._finish = function(pulse, as) {
  const counts = this._counts;
  // Tokens are arbitrary text, so key the lookup on a prototype-less object:
  // on a plain object, "constructor" would resolve to an inherited member.
  const tuples = this._tuples || (this._tuples = Object.create(null));
  const hasOwn = Object.prototype.hasOwnProperty;
  const text = as[0];
  const count = as[1];
  const out = pulse.fork(pulse.NO_SOURCE | pulse.NO_FIELDS);
  let w, t, c;

  for (w in counts) {
    t = tuples[w];
    c = counts[w] || 0;
    if (!t && c) {
      tuples[w] = (t = ingest({}));
      t[text] = w;
      t[count] = c;
      out.add.push(t);
    } else if (c === 0) {
      if (t) out.rem.push(t);
      // Keep the keys but clear the values so a later re-add starts clean.
      counts[w] = null;
      tuples[w] = null;
    } else if (t[count] !== c) {
      t[count] = c;
      out.mod.push(t);
    }
  }

  // When the counts are rebuilt from scratch the tuple lookup survives, so
  // tokens that no longer occur are missing from counts and were not visited
  // above. Retire their tuples, otherwise they linger downstream forever.
  for (w in tuples) {
    t = tuples[w];
    if (t && !(hasOwn.call(counts, w) && counts[w])) {
      out.rem.push(t);
      tuples[w] = null;
    }
  }

  return out.modifies(as);
};
|