UNPKG

vega-transforms/src/CountPattern.js

Version:

3.33 kB · JavaScript · View Raw

1import {Transform, ingest} from 'vega-dataflow';
2import {inherits} from 'vega-util';
3
/**
 * Transform that counts how often a regexp-defined pattern occurs in a
 * text field.
 * @constructor
 * @param {object} params - Parameter values for this operator.
 * @param {function(object): *} params.field - Accessor returning the text
 *   to scan for each input tuple.
 * @param {string} [params.pattern] - RegExp source string describing the
 *   tokens to count.
 * @param {string} [params.case] - 'lower' or 'upper' to normalize the text
 *   before matching; any other value leaves the text in its original case.
 * @param {string} [params.stopwords] - RegExp source string describing
 *   tokens that should be ignored.
 * @param {Array<string>} [params.as] - Output field names for the token
 *   text and its count, in that order (['text', 'count'] when omitted).
 */
export default function CountPattern(params) {
  Transform.call(this, null, params);
}
16
// Declarative description of the transform's parameters and their defaults.
// The defaults listed here mirror the fallbacks applied by the operator at
// runtime: the 'pattern' default matches the one used in _parameterCheck
// ([\w']+, i.e. word characters and apostrophes).
CountPattern.Definition = {
  'type': 'CountPattern',
  'metadata': {'generates': true, 'changes': true},
  'params': [
    { 'name': 'field', 'type': 'field', 'required': true },
    { 'name': 'case', 'type': 'enum', 'values': ['upper', 'lower', 'mixed'], 'default': 'mixed' },
    { 'name': 'pattern', 'type': 'string', 'default': '[\\w\']+' },
    { 'name': 'stopwords', 'type': 'string', 'default': '' },
    { 'name': 'as', 'type': 'string', 'array': true, 'length': 2, 'default': ['text', 'count'] }
  ]
};
28
/**
 * Split a text value into tokens, optionally normalizing its case first.
 * @param {*} text - The value to tokenize. null/undefined produce no
 *   tokens; any other non-string value is converted with String().
 * @param {string} [tcase] - 'upper' or 'lower' to convert the text before
 *   matching; any other value leaves the text unchanged.
 * @param {RegExp} match - The pattern handed to String.prototype.match.
 * @return {?Array<string>} The result of matching the text against the
 *   pattern, or null when nothing matches or the text is null/undefined.
 */
function tokenize(text, tcase, match) {
  // A missing field value has no tokens; returning null mirrors what
  // String.prototype.match returns when there are no matches.
  if (text == null) return null;

  let str = String(text);
  switch (tcase) {
    case 'upper': str = str.toUpperCase(); break;
    case 'lower': str = str.toLowerCase(); break;
  }
  return str.match(match);
}
36
inherits(CountPattern, Transform, {
  /**
   * Tokenize the text field of the visited tuples, update the running
   * per-token counts and emit the resulting output tuples.
   * @param {object} _ - The operator parameters (field, case, pattern,
   *   stopwords, as) together with a modified(name) change test.
   * @param {object} pulse - The input pulse.
   * @return {object} The forked output pulse produced by _finish.
   */
  transform(_, pulse) {
    const init = this._parameterCheck(_, pulse),
          counts = this._counts,
          match = this._match,
          stop = this._stop,
          get = _.field,
          as = _.as || ['text', 'count'];

    // Build a tuple visitor that applies `update` to every token of the
    // tuple's text that is not a stopword.
    const process = update => tuple => {
      const tokens = tokenize(get(tuple), _.case, match) || [];
      for (const token of tokens) {
        if (!stop.test(token)) update(token);
      }
    };

    const add = process(t => { counts[t] = 1 + (counts[t] || 0); }),
          rem = process(t => { counts[t] -= 1; });

    if (init) {
      // counts were reset: recount everything from the pulse source
      pulse.visit(pulse.SOURCE, add);
    } else {
      // incremental update: only account for added and removed tuples
      pulse.visit(pulse.ADD, add);
      pulse.visit(pulse.REM, rem);
    }

    return this._finish(pulse, as); // generate output tuples
  },

  /**
   * Rebuild cached regular expressions when their parameters change and
   * decide whether the counts must be recomputed from scratch.
   * @param {object} _ - The operator parameters.
   * @param {object} pulse - The input pulse.
   * @return {boolean} True if the counts were reset and all source tuples
   *   need to be re-processed.
   */
  _parameterCheck(_, pulse) {
    let init = false;

    if (_.modified('stopwords') || !this._stop) {
      // Group the user pattern so that ^ and $ anchor the whole expression;
      // without the group, 'the|and' would compile to /^the|and$/ and also
      // reject words such as 'theory' or 'band'.
      this._stop = new RegExp(`^(?:${_.stopwords || ''})$`, 'i');
      init = true;
    }

    if (_.modified('pattern') || !this._match) {
      this._match = new RegExp((_.pattern || '[\\w\']+'), 'g');
      init = true;
    }

    // The case setting changes the tokens themselves, so existing counts
    // keyed by the old casing are no longer valid.
    if (_.modified('case')) {
      init = true;
    }

    if (_.modified('field') || pulse.modified(_.field.fields)) {
      init = true;
    }

    // Null-prototype dictionary: tokens such as 'constructor' or 'toString'
    // must not collide with members inherited from Object.prototype.
    if (init) this._counts = Object.create(null);
    return init;
  },

  /**
   * Reconcile the current counts with the previously emitted tuples and
   * record the differences on a forked pulse as add / rem / mod entries.
   * @param {object} pulse - The input pulse to fork.
   * @param {Array<string>} as - Output field names: [text, count].
   * @return {object} The forked pulse, flagged as modifying the `as` fields.
   */
  _finish(pulse, as) {
    const counts = this._counts,
          tuples = this._tuples || (this._tuples = Object.create(null)),
          text = as[0],
          count = as[1],
          out = pulse.fork(pulse.NO_SOURCE | pulse.NO_FIELDS);
    let w, t, c;

    for (w in counts) {
      t = tuples[w];
      c = counts[w] || 0;
      if (!t && c) {
        // first occurrence of this token: emit a new tuple
        tuples[w] = (t = ingest({}));
        t[text] = w;
        t[count] = c;
        out.add.push(t);
      } else if (c === 0) {
        // token no longer occurs: retract its tuple, if one was emitted
        if (t) out.rem.push(t);
        counts[w] = null;
        tuples[w] = null;
      } else if (t[count] !== c) {
        t[count] = c;
        out.mod.push(t);
      }
    }

    // After a re-initialization the counts are rebuilt from scratch, so a
    // token that no longer occurs has no entry in counts at all. Retract
    // the tuples previously emitted for such tokens.
    for (w in tuples) {
      t = tuples[w];
      if (t && !(w in counts)) {
        out.rem.push(t);
        tuples[w] = null;
      }
    }

    return out.modifies(as);
  }
});

1	`import {Transform, ingest} from 'vega-dataflow';`
2	`import {inherits} from 'vega-util';`
3
4	`/**`
5	`* Count regexp-defined pattern occurrences in a text field.`
6	`* @constructor`
7	`* @param {object} params - The parameters for this operator.`
8	`* @param {function(object): *} params.field - An accessor for the text field.`
9	`* @param {string} [params.pattern] - RegExp string defining the text pattern.`
10	`* @param {string} [params.case] - One of 'lower', 'upper' or null (mixed) case.`
11	`* @param {string} [params.stopwords] - RegExp string of words to ignore.`
12	`*/`
13	`export default function CountPattern(params) {`
14	`Transform.call(this, null, params);`
15	`}`
16
17	`CountPattern.Definition = {`
18	`'type': 'CountPattern',`
19	`'metadata': {'generates': true, 'changes': true},`
20	`'params': [`
21	`{ 'name': 'field', 'type': 'field', 'required': true },`
22	`{ 'name': 'case', 'type': 'enum', 'values': ['upper', 'lower', 'mixed'], 'default': 'mixed' },`
23	`{ 'name': 'pattern', 'type': 'string', 'default': '[\\w"]+' },`
24	`{ 'name': 'stopwords', 'type': 'string', 'default': '' },`
25	`{ 'name': 'as', 'type': 'string', 'array': true, 'length': 2, 'default': ['text', 'count'] }`
26	`]`
27	`};`
28
29	`function tokenize(text, tcase, match) {`
30	`switch (tcase) {`
31	`case 'upper': text = text.toUpperCase(); break;`
32	`case 'lower': text = text.toLowerCase(); break;`
33	`}`
34	`return text.match(match);`
35	`}`
36
37	`inherits(CountPattern, Transform, {`
38	`transform(_, pulse) {`
39	`const process = update => tuple => {`
40	`var tokens = tokenize(get(tuple), _.case, match) \|\| [], t;`
41	`for (var i=0, n=tokens.length; i<n; ++i) {`
42	`if (!stop.test(t = tokens[i])) update(t);`
43	`}`
44	`};`
45
46	`const init = this._parameterCheck(_, pulse),`
47	`counts = this._counts,`
48	`match = this._match,`
49	`stop = this._stop,`
50	`get = _.field,`
51	`as = _.as \|\| ['text', 'count'],`
52	`add = process(t => counts[t] = 1 + (counts[t] \|\| 0)),`
53	`rem = process(t => counts[t] -= 1);`
54
55	`if (init) {`
56	`pulse.visit(pulse.SOURCE, add);`
57	`} else {`
58	`pulse.visit(pulse.ADD, add);`
59	`pulse.visit(pulse.REM, rem);`
60	`}`
61
62	`return this._finish(pulse, as); // generate output tuples`
63	`},`
64
65	`_parameterCheck(_, pulse) {`
66	`let init = false;`
67
68	`if (_.modified('stopwords') \|\| !this._stop) {`
69	`this._stop = new RegExp('^' + (_.stopwords \|\| '') + '$', 'i');`
70	`init = true;`
71	`}`
72
73	`if (_.modified('pattern') \|\| !this._match) {`
74	`this._match = new RegExp((_.pattern \|\| '[\\w\']+'), 'g');`
75	`init = true;`
76	`}`
77
78	`if (_.modified('field') \|\| pulse.modified(_.field.fields)) {`
79	`init = true;`
80	`}`
81
82	`if (init) this._counts = {};`
83	`return init;`
84	`},`
85
86	`_finish(pulse, as) {`
87	`const counts = this._counts,`
88	`tuples = this._tuples \|\| (this._tuples = {}),`
89	`text = as[0],`
90	`count = as[1],`
91	`out = pulse.fork(pulse.NO_SOURCE \| pulse.NO_FIELDS);`
92	`let w, t, c;`
93
94	`for (w in counts) {`
95	`t = tuples[w];`
96	`c = counts[w] \|\| 0;`
97	`if (!t && c) {`
98	`tuples[w] = (t = ingest({}));`
99	`t[text] = w;`
100	`t[count] = c;`
101	`out.add.push(t);`
102	`} else if (c === 0) {`
103	`if (t) out.rem.push(t);`
104	`counts[w] = null;`
105	`tuples[w] = null;`
106	`} else if (t[count] !== c) {`
107	`t[count] = c;`
108	`out.mod.push(t);`
109	`}`
110	`}`
111
112	`return out.modifies(as);`
113	`}`
114	`});`