UNPKG

3.31 kBJavaScriptView Raw
1import {Transform, ingest} from 'vega-dataflow';
2import {inherits} from 'vega-util';
3
/**
 * Transform that tallies how often regexp-matched tokens occur in a text field.
 * @constructor
 * @param {object} params - Operator parameters.
 * @param {function(object): *} params.field - Accessor returning the text to scan.
 * @param {string} [params.pattern] - RegExp source describing a single token.
 * @param {string} [params.case] - 'lower' or 'upper' to convert the text before
 *   matching; any other value (such as 'mixed') leaves the text as is.
 * @param {string} [params.stopwords] - RegExp source describing tokens to skip.
 * @param {Array<string>} [params.as] - Output field names for the token text
 *   and its count, in that order.
 */
export default function CountPattern(params) {
  // Hand construction over to the base Transform, passing null as its first argument.
  Transform.apply(this, [null, params]);
}
16
// Static description of the transform: its type name, metadata flags and the
// parameters it accepts together with their declared defaults.
CountPattern.Definition = {
  'type': 'CountPattern',
  'metadata': {'generates': true, 'changes': true},
  'params': [
    { 'name': 'field', 'type': 'field', 'required': true },
    { 'name': 'case', 'type': 'enum', 'values': ['upper', 'lower', 'mixed'], 'default': 'mixed' },
    // Word characters plus apostrophes (so "don't" is one token). This must
    // stay identical to the fallback pattern compiled in _parameterCheck.
    { 'name': 'pattern', 'type': 'string', 'default': '[\\w\']+' },
    { 'name': 'stopwords', 'type': 'string', 'default': '' },
    { 'name': 'as', 'type': 'string', 'array': true, 'length': 2, 'default': ['text', 'count'] }
  ]
};
28
/**
 * Split a value into tokens using a regular expression.
 * @param {*} text - Value to tokenize; anything that is not already a string
 *   is converted with String().
 * @param {string} tcase - 'upper' or 'lower' to convert case before matching;
 *   any other value leaves the case untouched.
 * @param {RegExp} match - Pattern handed to String.prototype.match.
 * @returns {?Array<string>} The result of match(), or null when the value is
 *   null/undefined or nothing matches.
 */
function tokenize(text, tcase, match) {
  // A missing field value simply has no tokens (calling a string method on it
  // would throw a TypeError).
  if (text == null) return null;

  let str = String(text);
  switch (tcase) {
    case 'upper': str = str.toUpperCase(); break;
    case 'lower': str = str.toLowerCase(); break;
  }
  return str.match(match);
}
36
// Link CountPattern to Transform; the value returned by inherits() is the
// object that the transform's methods are attached to.
const prototype = inherits(CountPattern, Transform);
38
/**
 * Update the token counts from the incoming pulse and produce the output.
 * @param {object} _ - Parameter values; reads field, case and as.
 * @param {object} pulse - Incoming pulse whose tuples are visited.
 * @returns {*} Whatever _finish returns for this pulse.
 */
prototype.transform = function(_, pulse) {
  // Must run first: it may recompile the matchers and replace the counts
  // object that is read immediately below.
  const rebuild = this._parameterCheck(_, pulse);
  const tally = this._counts;
  const pattern = this._match;
  const ignore = this._stop;
  const accessor = _.field;
  const names = _.as || ['text', 'count'];

  // Builds a tuple visitor that feeds every non-stopword token to `apply`.
  const visitor = (apply) => (tuple) => {
    const found = tokenize(accessor(tuple), _.case, pattern) || [];
    for (const token of found) {
      if (!ignore.test(token)) apply(token);
    }
  };

  const increment = visitor((token) => { tally[token] = 1 + (tally[token] || 0); });
  const decrement = visitor((token) => { tally[token] -= 1; });

  if (rebuild) {
    // Counts were reset, so recount every source tuple.
    pulse.visit(pulse.SOURCE, increment);
  } else {
    // Otherwise apply only the additions and removals.
    pulse.visit(pulse.ADD, increment);
    pulse.visit(pulse.REM, decrement);
  }

  // Turn the updated counts into output tuples.
  return this._finish(pulse, names);
};
67
/**
 * Recompile the stopword and token matchers when needed and decide whether
 * the counts have to be rebuilt from scratch.
 * @param {object} _ - Parameter values; reads stopwords, pattern and field,
 *   and calls _.modified(name).
 * @param {object} pulse - Incoming pulse; asked whether the accessed fields
 *   were modified.
 * @returns {boolean} True when the counts were reset and every source tuple
 *   must be recounted.
 */
prototype._parameterCheck = function(_, pulse) {
  let init = false;

  if (_.modified('stopwords') || !this._stop) {
    // Group the user pattern so that both anchors apply to the whole
    // expression: without the group, "the|and" would compile to /^the|and$/
    // and also reject "therapy" and "band".
    this._stop = new RegExp('^(?:' + (_.stopwords || '') + ')$', 'i');
    init = true;
  }

  if (_.modified('pattern') || !this._match) {
    this._match = new RegExp((_.pattern || '[\\w\']+'), 'g');
    init = true;
  }

  if (_.modified('field') || pulse.modified(_.field.fields)) {
    init = true;
  }

  // Prototype-less dictionary: tokens such as "constructor" or "toString"
  // must start from zero rather than pick up members inherited from Object.
  if (init) this._counts = Object.create(null);
  return init;
};
88
/**
 * Reconcile the current counts with the tuples emitted so far and report the
 * differences on a forked pulse.
 * @param {object} pulse - Incoming pulse; forked with NO_SOURCE | NO_FIELDS.
 * @param {Array<string>} as - Output field names: [text field, count field].
 * @returns {*} The value returned by calling modifies(as) on the forked pulse,
 *   whose add/rem/mod arrays hold new, retired and updated tuples.
 */
prototype._finish = function(pulse, as) {
  const counts = this._counts;
  // Prototype-less dictionary: a lookup for a token such as "constructor"
  // must not resolve to a member inherited from Object.
  const tuples = this._tuples || (this._tuples = Object.create(null));
  const text = as[0];
  const count = as[1];
  const out = pulse.fork(pulse.NO_SOURCE | pulse.NO_FIELDS);
  let w, t, c;

  for (w in counts) {
    t = tuples[w];
    c = counts[w] || 0; // null, undefined and NaN all count as zero
    if (!t && c) {
      // First sighting of this token: create and emit its tuple.
      tuples[w] = (t = ingest({}));
      t[text] = w;
      t[count] = c;
      out.add.push(t);
    } else if (c === 0) {
      // Token no longer occurs: retire its tuple, if one was emitted.
      if (t) out.rem.push(t);
      counts[w] = null;
      tuples[w] = null;
    } else if (t[count] !== c) {
      t[count] = c;
      out.mod.push(t);
    }
  }

  // When the counts object has been replaced (parameter or field change),
  // tokens that were emitted earlier but are absent from the new counts are
  // not reached by the loop above; retire their tuples here so they do not
  // linger in the output.
  for (w in tuples) {
    t = tuples[w];
    if (t && !Object.prototype.hasOwnProperty.call(counts, w)) {
      out.rem.push(t);
      tuples[w] = null;
    }
  }

  return out.modifies(as);
};