1 | ;
|
2 |
|
3 | Object.defineProperty(exports, '__esModule', { value: true });
|
4 |
|
/**
 * [Simple linear regression](http://en.wikipedia.org/wiki/Simple_linear_regression)
 * fits a straight line to a set of coordinates by least squares,
 * producing the slope and y-intercept of the regression line.
 *
 * @param {Array<Array<number>>} data an array of two-element arrays,
 * like `[[0, 1], [2, 3]]`
 * @returns {Object} object with `m` (slope) and `b` (intercept) of the
 * regression line
 * @example
 * linearRegression([[0, 0], [1, 1]]); // => { m: 1, b: 0 }
 */
function linearRegression(data) {
    var slope, intercept;

    // Cache the length so the property is looked up once.
    var n = data.length;

    if (n === 1) {
        // With a single point the fit is underdetermined: use a flat
        // line through that point's y value.
        slope = 0;
        intercept = data[0][1];
    } else {
        // Accumulate SS_x, SS_y, SS_xx and SS_xy in one pass.
        var xTotal = 0;
        var yTotal = 0;
        var xxTotal = 0;
        var xyTotal = 0;

        for (var i = 0; i < n; i++) {
            var xi = data[i][0];
            var yi = data[i][1];

            xTotal += xi;
            yTotal += yi;
            xxTotal += xi * xi;
            xyTotal += xi * yi;
        }

        // Closed-form least-squares slope…
        slope =
            (n * xyTotal - xTotal * yTotal) /
            (n * xxTotal - xTotal * xTotal);

        // …and the intercept that makes the line pass through the mean.
        intercept = yTotal / n - (slope * xTotal) / n;
    }

    return {
        m: slope,
        b: intercept
    };
}
|
73 |
|
/**
 * Given the output of `linearRegression` — an object with `m` (slope)
 * and `b` (intercept) — build a line function that maps x values to
 * y values.
 *
 * @param {Object} mb object with `m` and `b` members, representing
 * slope and intercept of the desired line
 * @returns {Function} method that computes the y-value at any given
 * x-value on the line.
 * @example
 * var l = linearRegressionLine(linearRegression([[0, 0], [1, 1]]));
 * l(0) // = 0
 * l(2) // = 2
 * linearRegressionLine({ b: 0, m: 1 })(1); // => 1
 * linearRegressionLine({ b: 1, m: 1 })(1); // => 2
 */
function linearRegressionLine(mb /*: { b: number, m: number }*/) {
    // The coefficients are read from `mb` at call time, so later
    // mutation of the object is reflected by the returned function.
    return function (x) {
        return mb.m * x + mb.b;
    };
}
|
99 |
|
/**
 * Compensated summation using the
 * [Kahan-Babuska algorithm](https://pdfs.semanticscholar.org/1760/7d467cda1d0277ad272deb2113533131dc09.pdf),
 * an improvement over the classical
 * [Kahan summation algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm).
 * Each naive addition loses low-order bits to floating-point roundoff;
 * this algorithm tracks that lost error in a separate accumulator and
 * folds it back in at the end, so it is more accurate than summing by
 * plain addition.
 *
 * Runs in `O(n)` with respect to the length of the array.
 *
 * @param {Array<number>} x input
 * @return {number} sum of all input numbers
 * @example
 * sum([1, 2, 3]); // => 6
 */
function sum(x) {
    // The sum of nothing is zero by convention.
    if (x.length === 0) {
        return 0;
    }

    // Running total, seeded with the first element.
    var total = x[0];

    // Accumulated floating-point error correction.
    var compensation = 0;

    var next;

    for (var i = 1; i < x.length; i++) {
        next = total + x[i];

        // Reconstruct the low-order bits lost by the addition above.
        // Which operand dominates in magnitude decides how the
        // residual must be recovered.
        if (Math.abs(total) >= Math.abs(x[i])) {
            compensation += total - next + x[i];
        } else {
            compensation += x[i] - next + total;
        }

        total = next;
    }

    // Fold the recovered error back into the result.
    return total + compensation;
}
|
149 |
|
/**
 * The mean, _also known as average_, is the sum of all values divided
 * by the number of values. It is a
 * [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * Runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @throws {Error} if the length of x is less than one
 * @returns {number} mean
 * @example
 * mean([0, 10]); // => 5
 */
function mean(x) {
    if (x.length > 0) {
        return sum(x) / x.length;
    }
    throw new Error("mean requires at least one data point");
}
|
171 |
|
/**
 * The sum of deviations from the mean, each raised to the Nth power.
 * When n=2 it's the sum of squared deviations; when n=3, the sum of
 * cubed deviations.
 *
 * @param {Array<number>} x sample of one or more data points
 * @param {number} n power
 * @returns {number} sum of nth power deviations
 *
 * @example
 * var input = [1, 2, 3];
 * // since the variance of a set is the mean squared
 * // deviations, we can calculate that with sumNthPowerDeviations:
 * sumNthPowerDeviations(input, 2) / input.length;
 */
function sumNthPowerDeviations(x, n) {
    var center = mean(x);
    var total = 0;
    var i;

    if (n === 2) {
        // Fast path: for squares, self-multiplication is significantly
        // cheaper than calling Math.pow.
        for (i = 0; i < x.length; i++) {
            var deviation = x[i] - center;
            total += deviation * deviation;
        }
    } else {
        // General case for any other exponent.
        for (i = 0; i < x.length; i++) {
            total += Math.pow(x[i] - center, n);
        }
    }

    return total;
}
|
209 |
|
/**
 * The [variance](http://en.wikipedia.org/wiki/Variance) is the mean of
 * the squared deviations from the mean.
 *
 * This is the population variance, not the sample variance: see the
 * `sampleVariance` method if you want a sample measure.
 *
 * @param {Array<number>} x a population of one or more data points
 * @returns {number} variance: a value greater than or equal to zero.
 * Zero indicates that all values are identical.
 * @throws {Error} if x's length is 0
 * @example
 * variance([1, 2, 3, 4, 5, 6]); // => 2.9166666666666665
 */
function variance(x) {
    if (x.length > 0) {
        // Average the squared deviations between each value and the mean.
        return sumNthPowerDeviations(x, 2) / x.length;
    }
    throw new Error("variance requires at least one data point");
}
|
233 |
|
/**
 * The [standard deviation](http://en.wikipedia.org/wiki/Standard_deviation)
 * is the square root of the variance, also known as the population
 * standard deviation. It measures the amount of variation or dispersion
 * in a set of values.
 *
 * It is only appropriate for full-population knowledge: for samples of
 * a population, {@link sampleStandardDeviation} is more appropriate.
 *
 * @param {Array<number>} x input
 * @returns {number} standard deviation
 * @example
 * variance([2, 4, 4, 4, 5, 5, 7, 9]); // => 4
 * standardDeviation([2, 4, 4, 4, 5, 5, 7, 9]); // => 2
 */
function standardDeviation(x) {
    // A single observation has no spread.
    if (x.length === 1) {
        return 0;
    }
    return Math.sqrt(variance(x));
}
|
257 |
|
/**
 * The [R Squared](http://en.wikipedia.org/wiki/Coefficient_of_determination)
 * value of data compared with a function `f` measures how well the
 * function predicts the data, based on the squared differences between
 * predictions and actual values.
 *
 * @param {Array<Array<number>>} x input data: this should be doubly-nested
 * @param {Function} func function called on `[i][0]` values within the dataset
 * @returns {number} r-squared value
 * @example
 * var samples = [[0, 0], [1, 1]];
 * var regressionLine = linearRegressionLine(linearRegression(samples));
 * rSquared(samples, regressionLine); // = 1 this line is a perfect fit
 */
function rSquared(x, func) {
    // Fewer than two points always fit perfectly.
    if (x.length < 2) {
        return 1;
    }

    var i;

    // Mean of the observed y values, needed for the total sum of squares.
    var yTotal = 0;
    for (i = 0; i < x.length; i++) {
        yTotal += x[i][1];
    }
    var yMean = yTotal / x.length;

    // Total sum of squares: spread of each observation around the mean.
    var totalSS = 0;
    for (i = 0; i < x.length; i++) {
        totalSS += Math.pow(yMean - x[i][1], 2);
    }

    // Residual error: squared difference between the prediction and the
    // actual value at each point.
    var residual = 0;
    for (i = 0; i < x.length; i++) {
        residual += Math.pow(x[i][1] - func(x[i][0]), 2);
    }

    // As the residual grows relative to the total spread, r-squared
    // drops toward (and below) zero; a perfect fit yields 1.
    return 1 - residual / totalSS;
}
|
307 |
|
/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times. In the event of a
 * tie, this algorithm keeps the mode it found first.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * Runs in `O(n)` because the input is already sorted.
 *
 * @param {Array<number>} sorted a sorted sample of one or more data points
 * @returns {number} mode
 * @throws {Error} if sorted is empty
 * @example
 * modeSorted([0, 0, 1]); // => 0
 */
function modeSorted(sorted) {
    // The mode of an empty list is undefined.
    if (sorted.length === 0) {
        throw new Error("mode requires at least one data point");
    }
    // A single element is trivially its own mode.
    if (sorted.length === 1) {
        return sorted[0];
    }

    var best = NaN; // best mode found so far
    var bestCount = 0; // occurrences of that mode
    var current = sorted[0]; // value of the run currently being counted
    var runLength = 1; // length of the current run

    // Iterate one index past the end so the final run is also flushed:
    // sorted[sorted.length] is undefined and never equals `current`.
    for (var i = 1; i <= sorted.length; i++) {
        if (sorted[i] !== current) {
            // A run just ended: promote it if it was the longest yet.
            if (runLength > bestCount) {
                bestCount = runLength;
                best = current;
            }
            runLength = 1;
            current = sorted[i];
        } else {
            // Same value again: extend the current run.
            runLength++;
        }
    }

    return best;
}
|
369 |
|
/**
 * Sort an array of numbers by their numeric value, without changing the
 * input array in place.
 *
 * This is necessary because the default behavior of `.sort` in
 * JavaScript is to compare elements as strings:
 *
 *     [1, 10, 12, 102, 20].sort()
 *     // output
 *     [1, 10, 102, 12, 20]
 *
 * @param {Array<number>} x input array
 * @return {Array<number>} sorted array
 * @private
 * @example
 * numericSort([3, 2, 1]) // => [1, 2, 3]
 */
function numericSort(x) {
    // Copy first so the caller's array is left untouched.
    var copy = x.slice();
    // Sort numerically: subtraction yields a proper numeric comparator.
    copy.sort(function (a, b) {
        return a - b;
    });
    return copy;
}
|
398 |
|
/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * Runs in `O(n log(n))` because it sorts the array internally before
 * running an `O(n)` scan to find the mode.
 *
 * @param {Array<number>} x input
 * @returns {number} mode
 * @example
 * mode([0, 0, 1]); // => 0
 */
function mode(x) {
    // Sorting groups equal values into contiguous runs, which is the
    // precondition modeSorted relies on.
    var sorted = numericSort(x);
    return modeSorted(sorted);
}
|
422 |
|
/* globals Map: false */

/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times.
 * There can be multiple modes in a list: in the event of a tie, this
 * algorithm will return the most recently seen mode.
 *
 * modeFast uses a Map object to keep track of the mode, instead of the approach
 * used with `mode`, a sorted array. As a result, it is faster
 * than `mode` and supports any data type that can be compared with `==`.
 * It also requires a
 * [JavaScript environment with support for Map](https://kangax.github.io/compat-table/es6/#test-Map),
 * and will throw an error if Map is not available.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * @param {Array<*>} x a sample of one or more data points
 * @returns {?*} mode
 * @throws {ReferenceError} if the JavaScript environment doesn't support Map
 * @throws {Error} if x is empty
 * @example
 * modeFast(['rabbits', 'rabbits', 'squirrels']); // => 'rabbits'
 */
function modeFast(x) {
    // Incidence index, mapping each value to its count so far:
    // { value: count }
    var index = new Map();

    // Running mode and the number of times it has been encountered.
    var mode;
    var modeCount = 0;

    for (var i = 0; i < x.length; i++) {
        var newCount = index.get(x[i]);
        if (newCount === undefined) {
            newCount = 1;
        } else {
            newCount++;
        }
        // Strict `>` means an equally-frequent later value does not
        // displace the current mode until it is seen more often.
        if (newCount > modeCount) {
            mode = x[i];
            modeCount = newCount;
        }
        index.set(x[i], newCount);
    }

    // No element was ever counted: the input was empty.
    // (Fixed typo: previously read "at last one data point".)
    if (modeCount === 0) {
        throw new Error("mode requires at least one data point");
    }

    return mode;
}
|
478 |
|
/**
 * The min is the lowest number in the array.
 * Runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @throws {Error} if the length of x is less than one
 * @returns {number} minimum value
 * @example
 * min([1, 5, -10, 100, 2]); // => -10
 */
function min(x) {
    if (x.length === 0) {
        throw new Error("min requires at least one data point");
    }

    // Seed with the first element, then keep the smaller of the
    // accumulator and each remaining element.
    var smallest = x[0];
    for (var i = 1; i < x.length; i++) {
        if (x[i] < smallest) {
            smallest = x[i];
        }
    }
    return smallest;
}
|
502 |
|
/**
 * This computes the maximum number in an array.
 *
 * Runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @returns {number} maximum value
 * @throws {Error} if the length of x is less than one
 * @example
 * max([1, 2, 3, 4]);
 * // => 4
 */
function max(x) {
    if (x.length === 0) {
        throw new Error("max requires at least one data point");
    }

    // Seed with the first element, then keep the larger of the
    // accumulator and each remaining element.
    var largest = x[0];
    for (var i = 1; i < x.length; i++) {
        if (x[i] > largest) {
            largest = x[i];
        }
    }
    return largest;
}
|
528 |
|
/**
 * This computes the minimum & maximum number in an array in one pass.
 *
 * Runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @returns {Array<number>} minimum & maximum value
 * @throws {Error} if the length of x is less than one
 * @example
 * extent([1, 2, 3, 4]);
 * // => [1, 4]
 */
function extent(x) {
    if (x.length === 0) {
        throw new Error("extent requires at least one data point");
    }

    // Track both bounds simultaneously in a single scan.
    var lo = x[0];
    var hi = x[0];
    for (var i = 1; i < x.length; i++) {
        if (x[i] < lo) {
            lo = x[i];
        }
        if (x[i] > hi) {
            hi = x[i];
        }
    }
    return [lo, hi];
}
|
558 |
|
/**
 * The minimum is the lowest number in the array. With a sorted array,
 * the first element in the array is always the smallest, so this calculation
 * can be done in one step, or constant time.
 *
 * Note: performs no validation — on an empty array this returns
 * `undefined` rather than throwing.
 *
 * @param {Array<number>} x input, sorted in ascending order
 * @returns {number} minimum value
 * @example
 * minSorted([-100, -10, 1, 2, 5]); // => -100
 */
function minSorted(x) {
    // Ascending order guarantees the first element is the smallest.
    return x[0];
}
|
572 |
|
/**
 * The maximum is the highest number in the array. With a sorted array,
 * the last element in the array is always the largest, so this calculation
 * can be done in one step, or constant time.
 *
 * @param {Array<number>} x input, sorted in ascending order
 * @returns {number} maximum value
 * @example
 * maxSorted([-100, -10, 1, 2, 5]); // => 5
 */
function maxSorted(x) {
    // Ascending order guarantees the last element is the largest.
    var lastIndex = x.length - 1;
    return x[lastIndex];
}
|
586 |
|
/**
 * The extent is the lowest & highest number in the array. With a sorted
 * array, the first element is always the lowest and the last element is
 * always the largest, so this calculation can be done in one step, or
 * constant time.
 *
 * @param {Array<number>} x input, sorted in ascending order
 * @returns {Array<number>} minimum & maximum value
 * @example
 * extentSorted([-100, -10, 1, 2, 5]); // => [-100, 5]
 */
function extentSorted(x) {
    // First element is the minimum, last is the maximum.
    var lowest = x[0];
    var highest = x[x.length - 1];
    return [lowest, highest];
}
|
600 |
|
/**
 * The simple [sum](https://en.wikipedia.org/wiki/Summation) of an array
 * is the result of adding all numbers together, starting from zero.
 * Unlike `sum`, this performs no floating-point error compensation.
 *
 * Runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x input
 * @return {number} sum of all input numbers
 * @example
 * sumSimple([1, 2, 3]); // => 6
 */
function sumSimple(x) {
    // Accumulate left-to-right so floating-point results match a plain
    // sequential addition.
    var total = 0;
    var i = 0;
    while (i < x.length) {
        total += x[i];
        i++;
    }
    return total;
}
|
619 |
|
/**
 * The [product](https://en.wikipedia.org/wiki/Product_(mathematics)) of an array
 * is the result of multiplying all numbers together, using one as the
 * multiplicative identity.
 *
 * Runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x input
 * @return {number} product of all input numbers
 * @example
 * product([1, 2, 3, 4]); // => 24
 */
function product(x) {
    // Left fold starting from the multiplicative identity; an empty
    // array therefore yields 1.
    return x.reduce(function (acc, value) {
        return acc * value;
    }, 1);
}
|
638 |
|
/**
 * This is the internal implementation of quantiles: when you know
 * that the order is sorted, you don't need to re-sort it, and the
 * computations are faster.
 *
 * @param {Array<number>} x sample of one or more data points, sorted
 * @param {number} p desired quantile: a number between 0 to 1, inclusive
 * @returns {number} quantile value
 * @throws {Error} if p is outside of the range from 0 to 1
 * @throws {Error} if x is empty
 * @example
 * quantileSorted([3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20], 0.5); // => 9
 */
function quantileSorted(x, p) {
    var pos = x.length * p;

    if (x.length === 0) {
        throw new Error("quantile requires at least one data point.");
    }
    if (p < 0 || p > 1) {
        throw new Error("quantiles must be between 0 and 1");
    }
    if (p === 1) {
        // The 1-quantile is simply the last element.
        return x[x.length - 1];
    }
    if (p === 0) {
        // The 0-quantile is simply the first element.
        return x[0];
    }
    if (pos % 1 !== 0) {
        // Fractional index: take the element just above it.
        return x[Math.ceil(pos) - 1];
    }
    if (x.length % 2 === 0) {
        // Even-length list with an integer index: average the two
        // values straddling the position.
        return (x[pos - 1] + x[pos]) / 2;
    }
    // Odd-length list with an integer index: the exact element.
    return x[pos];
}
|
677 |
|
/**
 * Rearrange items in `arr` so that all items in `[left, k]` range are the smallest.
 * The `k`-th element will have the `(k - left + 1)`-th smallest value in `[left, right]`.
 *
 * Implements Floyd-Rivest selection algorithm https://en.wikipedia.org/wiki/Floyd-Rivest_algorithm
 *
 * @param {Array<number>} arr input array
 * @param {number} k pivot index
 * @param {number} [left] left index
 * @param {number} [right] right index
 * @returns {void} mutates input array
 * @example
 * var arr = [65, 28, 59, 33, 21, 56, 22, 95, 50, 12, 90, 53, 28, 77, 39];
 * quickselect(arr, 8);
 * // = [39, 28, 28, 33, 21, 12, 22, 50, 53, 56, 59, 65, 90, 77, 95]
 */
function quickselect(arr, k, left, right) {
    // Default to the whole array. (Note: `||` means an explicit 0 for
    // `right` also falls back to arr.length - 1.)
    left = left || 0;
    right = right || arr.length - 1;

    while (right > left) {
        // 600 and 0.5 are arbitrary constants chosen in the original paper to minimize execution time
        if (right - left > 600) {
            // Floyd-Rivest sampling step: recursively narrow the window
            // to an estimated subrange around k before partitioning.
            var n = right - left + 1;
            var m = k - left + 1;
            var z = Math.log(n);
            var s = 0.5 * Math.exp((2 * z) / 3);
            var sd = 0.5 * Math.sqrt((z * s * (n - s)) / n);
            if (m - n / 2 < 0) { sd *= -1; }
            var newLeft = Math.max(left, Math.floor(k - (m * s) / n + sd));
            var newRight = Math.min(
                right,
                Math.floor(k + ((n - m) * s) / n + sd)
            );
            quickselect(arr, k, newLeft, newRight);
        }

        // Partition [left, right] around the pivot value arr[k].
        var t = arr[k];
        var i = left;
        var j = right;

        // Move the pivot to the front; ensure arr[right] <= pivot so the
        // inner scans below cannot run off either end.
        swap(arr, left, k);
        if (arr[right] > t) { swap(arr, left, right); }

        // Hoare-style partition: advance i and retreat j, exchanging
        // out-of-place elements until the scans cross.
        while (i < j) {
            swap(arr, i, j);
            i++;
            j--;
            while (arr[i] < t) { i++; }
            while (arr[j] > t) { j--; }
        }

        // Put the pivot into its final position.
        if (arr[left] === t) { swap(arr, left, j); }
        else {
            j++;
            swap(arr, j, right);
        }

        // Shrink the window to whichever side still contains index k.
        if (j <= k) { left = j + 1; }
        if (k <= j) { right = j - 1; }
    }
}
|
740 |
|
/**
 * Exchange the elements at positions `i` and `j` of `arr` in place.
 *
 * @param {Array} arr array to mutate
 * @param {number} i first index
 * @param {number} j second index
 * @returns {void}
 */
function swap(arr, i, j) {
    var held = arr[i];
    arr[i] = arr[j];
    arr[j] = held;
}
|
746 |
|
/**
 * The [quantile](https://en.wikipedia.org/wiki/Quantile):
 * this is a population quantile, since we assume to know the entire
 * dataset in this library. This is an implementation of the
 * [Quantiles of a Population](http://en.wikipedia.org/wiki/Quantile#Quantiles_of_a_population)
 * algorithm from wikipedia.
 *
 * `x` is a one-dimensional array of numbers, and `p` is either a
 * decimal number from 0 to 1 or an array of such numbers. In terms of a
 * k/q quantile, p = k/q. When `p` is an array, the result is an array
 * containing the corresponding quantiles in input order.
 *
 * @param {Array<number>} x sample of one or more numbers
 * @param {Array<number> | number} p the desired quantile(s), each a number between 0 and 1
 * @returns {number} quantile
 * @example
 * quantile([3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20], 0.5); // => 9
 */
function quantile(x, p) {
    // Work on a copy: the selection below reorders elements.
    var copy = x.slice();

    if (Array.isArray(p)) {
        // Partially sort the copy so that every index a requested
        // quantile touches already holds its fully-sorted value…
        multiQuantileSelect(copy, p);
        // …then read each quantile off the prepared array.
        var results = [];
        for (var i = 0; i < p.length; i++) {
            results[i] = quantileSorted(copy, p[i]);
        }
        return results;
    }

    // Single quantile: place just the needed element(s), then read.
    var idx = quantileIndex(copy.length, p);
    quantileSelect(copy, idx, 0, copy.length - 1);
    return quantileSorted(copy, p);
}
|
788 |
|
/**
 * Place the element(s) a quantile needs into their sorted positions.
 * An integer `k` needs one element; a fractional `k` (the even-length
 * average case) needs the two neighboring elements in place.
 *
 * @param {Array<number>} arr array to partially sort in place
 * @param {number} k target index, possibly fractional (n + 0.5)
 * @param {number} left left bound of the working range
 * @param {number} right right bound of the working range
 * @returns {void}
 */
function quantileSelect(arr, k, left, right) {
    if (k % 1 === 0) {
        quickselect(arr, k, left, right);
        return;
    }
    // Fractional k: the quantile averages two neighbors, so both must
    // be moved into their sorted slots.
    k = Math.floor(k);
    quickselect(arr, k, left, right);
    quickselect(arr, k + 1, k + 1, right);
}
|
798 |
|
/**
 * Prepare `arr` for reading several quantiles at once: every index any
 * requested quantile touches ends up holding its fully-sorted value.
 * Ranges between target indices are refined divide-and-conquer style
 * with an explicit stack instead of recursion.
 *
 * @param {Array<number>} arr array to partially sort in place
 * @param {Array<number>} p requested quantiles, each between 0 and 1
 * @returns {void}
 */
function multiQuantileSelect(arr, p) {
    // Collect every target index, bracketed by the array's two ends,
    // then order them ascending.
    var targets = [0];
    for (var i = 0; i < p.length; i++) {
        targets.push(quantileIndex(arr.length, p[i]));
    }
    targets.push(arr.length - 1);
    targets.sort(compare);

    // Stack of [lo, hi] pairs of positions into `targets`.
    var ranges = [0, targets.length - 1];

    while (ranges.length) {
        // Pop in reverse of push order: hi first, then lo.
        var hi = Math.ceil(ranges.pop());
        var lo = Math.floor(ranges.pop());
        if (hi - lo <= 1) { continue; }

        // Fix the middle target within this range, then refine both halves.
        var mid = Math.floor((lo + hi) / 2);
        quantileSelect(
            arr,
            targets[mid],
            Math.floor(targets[lo]),
            Math.ceil(targets[hi])
        );

        ranges.push(lo, mid, mid, hi);
    }
}
|
825 |
|
/**
 * Ascending numeric comparator for Array#sort.
 *
 * @param {number} a first value
 * @param {number} b second value
 * @returns {number} negative if a < b, positive if a > b, zero if equal
 */
function compare(a, b) {
    return a - b;
}
|
829 |
|
/**
 * Map a quantile `p` of a length-`len` array to the index (or pair of
 * indices) that holds it. A half-integer result (n + 0.5) signals that
 * the quantile is the average of the two straddling elements.
 *
 * @param {number} len array length
 * @param {number} p quantile, between 0 and 1
 * @returns {number} index, possibly fractional (n + 0.5)
 */
function quantileIndex(len, p) {
    var pos = len * p;

    if (p === 1) {
        // The 1-quantile lives at the last index.
        return len - 1;
    }
    if (p === 0) {
        // The 0-quantile lives at the first index.
        return 0;
    }
    if (pos % 1 !== 0) {
        // Fractional position: the next element up holds the quantile.
        return Math.ceil(pos) - 1;
    }
    if (len % 2 === 0) {
        // Even length with an integer position: the midpoint between
        // the two straddling indices signals "average these two".
        return pos - 0.5;
    }
    // Odd length with an integer position: exact index.
    return pos;
}
|
851 |
|
852 | /* eslint no-bitwise: 0 */
|
853 |
|
854 | /**
|
855 | * This function returns the quantile in which one would find the given value in
|
856 | * the given array. With a sorted array, leveraging binary search, we can find
|
857 | * this information in logarithmic time.
|
858 | *
|
859 | * @param {Array<number>} x input
|
860 | * @returns {number} value value
|
861 | * @example
|
862 | * quantileRankSorted([1, 2, 3, 4], 3); // => 0.75
|
863 | * quantileRankSorted([1, 2, 3, 3, 4], 3); // => 0.7
|
864 | * quantileRankSorted([1, 2, 3, 4], 6); // => 1
|
865 | * quantileRankSorted([1, 2, 3, 3, 5], 4); // => 0.8
|
866 | */
|
867 | function quantileRankSorted(x, value) {
|
868 | // Value is lesser than any value in the array
|
869 | if (value < x[0]) {
|
870 | return 0;
|
871 | }
|
872 |
|
873 | // Value is greater than any value in the array
|
874 | if (value > x[x.length - 1]) {
|
875 | return 1;
|
876 | }
|
877 |
|
878 | var l = lowerBound(x, value);
|
879 |
|
880 | // Value is not in the array
|
881 | if (x[l] !== value) {
|
882 | return l / x.length;
|
883 | }
|
884 |
|
885 | l++;
|
886 |
|
887 | var u = upperBound(x, value);
|
888 |
|
889 | // The value exists only once in the array
|
890 | if (u === l) {
|
891 | return l / x.length;
|
892 | }
|
893 |
|
894 | // Here, we are basically computing the mean of the range of indices
|
895 | // containing our searched value. But, instead, of initializing an
|
896 | // array and looping over it, there is a dedicated math formula that
|
897 | // we apply below to get the result.
|
898 | var r = u - l + 1;
|
899 | var sum = (r * (u + l)) / 2;
|
900 | var mean = sum / r;
|
901 |
|
902 | return mean / x.length;
|
903 | }
|
904 |
|
/**
 * Binary search for the first index at which `value` could be inserted
 * into the sorted array `x` without breaking the ordering — i.e. the
 * count of elements strictly less than `value`.
 *
 * @private
 * @param {Array<number>} x sorted input array
 * @param {number} value value to locate
 * @returns {number} smallest index `i` with `x[i] >= value`, or `x.length`
 * if every element is smaller
 */
function lowerBound(x, value) {
    var mid = 0;
    var lo = 0;
    var hi = x.length;

    while (lo < hi) {
        // Midpoint of the remaining search window
        mid = Math.floor((lo + hi) / 2);

        if (value <= x[mid]) {
            hi = mid;
        } else {
            // `mid + 1` replaces the original's obscure bitwise `-~mid`
            lo = mid + 1;
        }
    }

    return lo;
}
|
922 |
|
/**
 * Binary search for the last index at which `value` could be inserted
 * into the sorted array `x` without breaking the ordering — i.e. the
 * count of elements less than or equal to `value`.
 *
 * @private
 * @param {Array<number>} x sorted input array
 * @param {number} value value to locate
 * @returns {number} smallest index `i` with `x[i] > value`, or `x.length`
 * if no element is larger
 */
function upperBound(x, value) {
    var mid = 0;
    var lo = 0;
    var hi = x.length;

    while (lo < hi) {
        // Midpoint of the remaining search window
        mid = Math.floor((lo + hi) / 2);

        if (value >= x[mid]) {
            // `mid + 1` replaces the original's obscure bitwise `-~mid`
            lo = mid + 1;
        } else {
            hi = mid;
        }
    }

    return lo;
}
|
940 |
|
/**
 * This function returns the quantile in which one would find the given value in
 * the given array. It will copy and sort your array before each run, so
 * if you know your array is already sorted, you should use `quantileRankSorted`
 * instead.
 *
 * @param {Array<number>} x input
 * @returns {number} value value
 * @example
 * quantileRank([4, 3, 1, 2], 3); // => 0.75
 * quantileRank([4, 3, 2, 3, 1], 3); // => 0.7
 * quantileRank([2, 4, 1, 3], 6); // => 1
 * quantileRank([5, 3, 1, 2, 3], 4); // => 0.8
 */
function quantileRank(x, value) {
    // Work on a sorted copy so the caller's array is left untouched,
    // then delegate to the binary-search implementation.
    return quantileRankSorted(numericSort(x), value);
}
|
961 |
|
/**
 * The [Interquartile range](http://en.wikipedia.org/wiki/Interquartile_range) is
 * a measure of statistical dispersion, or how scattered, spread, or
 * concentrated a distribution is. It's computed as the difference between
 * the third quartile and first quartile.
 *
 * @param {Array<number>} x sample of one or more numbers
 * @returns {number} interquartile range: the span between lower and upper quartile,
 * 0.25 and 0.75
 * @example
 * interquartileRange([0, 1, 2, 3]); // => 2
 */
function interquartileRange(x) {
    // The IQR is the distance between the upper (0.75) and
    // lower (0.25) quartiles.
    var upperQuartile = quantile(x, 0.75);
    var lowerQuartile = quantile(x, 0.25);

    // Only subtract when both quartiles came back as plain numbers;
    // otherwise the function implicitly returns undefined.
    if (
        typeof upperQuartile === "number" &&
        typeof lowerQuartile === "number"
    ) {
        return upperQuartile - lowerQuartile;
    }
}
|
984 |
|
/**
 * The [median](http://en.wikipedia.org/wiki/Median) is
 * the middle number of a list. This is often a good indicator of 'the middle'
 * when there are outliers that skew the `mean()` value.
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * The median isn't necessarily one of the elements in the list: the value
 * can be the average of two elements if the list has an even length
 * and the two central values are different.
 *
 * @param {Array<number>} x input
 * @returns {number} median value
 * @example
 * median([10, 2, 5, 100, 2, 1]); // => 3.5
 */
function median(x) {
    // The median is simply the 0.5 quantile; the unary plus keeps the
    // numeric return contract.
    return +quantile(x, 0.5);
}
|
1004 |
|
/**
 * The [Median Absolute Deviation](http://en.wikipedia.org/wiki/Median_absolute_deviation) is
 * a robust measure of statistical
 * dispersion. It is more resilient to outliers than the standard deviation.
 *
 * @param {Array<number>} x input array
 * @returns {number} median absolute deviation
 * @example
 * medianAbsoluteDeviation([1, 1, 2, 2, 4, 6, 9]); // => 1
 */
function medianAbsoluteDeviation(x) {
    var center = median(x);

    // Transform each value into its absolute distance from the median,
    // then take the median of those distances.
    var absoluteDeviations = x.map(function (value) {
        return Math.abs(value - center);
    });

    return median(absoluteDeviations);
}
|
1027 |
|
/**
 * Split an array into chunks of a specified size. This function
 * has the same behavior as [PHP's array_chunk](http://php.net/manual/en/function.array-chunk.php)
 * function, and thus will insert smaller-sized chunks at the end if
 * the input size is not divisible by the chunk size.
 *
 * `x` is expected to be an array, and `chunkSize` a number.
 * The `x` array can contain any kind of data.
 *
 * @param {Array} x a sample
 * @param {number} chunkSize size of each output array. must be a positive integer
 * @returns {Array<Array>} a chunked array
 * @throws {Error} if chunk size is less than 1 or not an integer
 * @example
 * chunk([1, 2, 3, 4, 5, 6], 2);
 * // => [[1, 2], [3, 4], [5, 6]]
 */
function chunk(x, chunkSize) {
    // Guard against a non-positive chunk size: the slicing loop below
    // advances by `chunkSize` and would never terminate.
    if (chunkSize < 1) {
        throw new Error("chunk size must be a positive number");
    }

    // Fractional sizes are also invalid input.
    if (Math.floor(chunkSize) !== chunkSize) {
        throw new Error("chunk size must be an integer");
    }

    var result = [];

    // Walk the input in strides of `chunkSize`, copying each stride
    // with `.slice`, which never mutates the source array.
    for (var offset = 0; offset < x.length; offset += chunkSize) {
        result.push(x.slice(offset, offset + chunkSize));
    }

    return result;
}
|
1071 |
|
/**
 * Sampling with replacement is a type of sampling that allows the same
 * item to be picked out of a population more than once.
 *
 * @param {Array<*>} x an array of any kind of value
 * @param {number} n count of how many elements to take
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @return {Array} n sampled items from the population
 * @example
 * var values = [1, 2, 3, 4];
 * sampleWithReplacement(values, 2); // returns 2 random values, like [2, 4];
 */
function sampleWithReplacement(x, n, randomSource) {
    // Nothing to draw from an empty population.
    if (x.length === 0) {
        return [];
    }

    // Allow callers to inject a seeded or alternative RNG, like
    // [random-js](https://www.npmjs.org/package/random-js);
    // fall back to Math.random otherwise.
    randomSource = randomSource || Math.random;

    var populationSize = x.length;
    var picked = [];

    // Each draw is independent, so the same element may appear
    // multiple times in the result.
    for (var draw = 0; draw < n; draw++) {
        var position = Math.floor(randomSource() * populationSize);
        picked.push(x[position]);
    }

    return picked;
}
|
1106 |
|
/**
 * A [Fisher-Yates shuffle](http://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle)
 * in-place - which means that it **will change the order of the original
 * array by reference**.
 *
 * This is an algorithm that generates a random [permutation](https://en.wikipedia.org/wiki/Permutation)
 * of a set.
 *
 * @param {Array} x sample of one or more numbers
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @returns {Array} x
 * @example
 * var x = [1, 2, 3, 4];
 * shuffleInPlace(x);
 * // x is shuffled to a value like [2, 1, 4, 3]
 */
function shuffleInPlace(x, randomSource) {
    // Allow callers to inject a seeded or alternative RNG, like
    // [random-js](https://www.npmjs.org/package/random-js);
    // fall back to Math.random otherwise.
    randomSource = randomSource || Math.random;

    // `remaining` counts how many elements have not yet been placed;
    // everything at index >= remaining is already shuffled.
    var remaining = x.length;

    while (remaining > 0) {
        // Pick a random index within the unshuffled prefix...
        var pick = Math.floor(randomSource() * remaining);
        remaining -= 1;

        // ...and swap it with the last unshuffled slot.
        var held = x[remaining];
        x[remaining] = x[pick];
        x[pick] = held;
    }

    return x;
}
|
1157 |
|
/**
 * A [Fisher-Yates shuffle](http://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle)
 * is a fast way to create a random permutation of a finite set. This is
 * a function around `shuffle_in_place` that adds the guarantee that
 * it will not modify its input.
 *
 * @param {Array} x sample of 0 or more numbers
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @return {Array} shuffled version of input
 * @example
 * var shuffled = shuffle([1, 2, 3, 4]);
 * shuffled; // = [2, 3, 1, 4] or any other random permutation
 */
function shuffle(x, randomSource) {
    // Shallow-copy the input first, then shuffle the copy in place,
    // so the caller's array is never reordered.
    return shuffleInPlace(x.slice(), randomSource);
}
|
1179 |
|
/**
 * Create a [simple random sample](http://en.wikipedia.org/wiki/Simple_random_sample)
 * from a given array of `n` elements.
 *
 * The sampled values will be in any order, not necessarily the order
 * they appear in the input.
 *
 * @param {Array<any>} x input array. can contain any type
 * @param {number} n count of how many elements to take
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @return {Array} subset of n elements in original array
 *
 * @example
 * var values = [1, 2, 4, 5, 6, 7, 8, 9];
 * sample(values, 3); // returns 3 random values, like [2, 5, 8];
 */
function sample(x, n, randomSource) {
    // A Fisher-Yates shuffle followed by taking the first `n` elements
    // yields a uniform simple random sample without replacement.
    var randomized = shuffle(x, randomSource);
    return randomized.slice(0, n);
}
|
1204 |
|
/**
 * Create a new column x row matrix, with every cell initialized to zero.
 *
 * @private
 * @param {number} columns
 * @param {number} rows
 * @return {Array<Array<number>>} matrix
 * @example
 * makeMatrix(10, 10);
 */
function makeMatrix(columns, rows) {
    var grid = [];
    // Build one inner array of `rows` zeros per column.
    for (var c = 0; c < columns; c++) {
        var column = [];
        for (var r = 0; r < rows; r++) {
            column.push(0);
        }
        grid.push(column);
    }
    return grid;
}
|
1226 |
|
/**
 * For a sorted input, counting the number of unique values
 * is possible in constant time and constant memory. This is
 * a simple implementation of the algorithm.
 *
 * Values are compared with `===`, so objects and non-primitive objects
 * are not handled in any special way.
 *
 * @param {Array<*>} x an array of any kind of value
 * @returns {number} count of unique values
 * @example
 * uniqueCountSorted([1, 2, 3]); // => 3
 * uniqueCountSorted([1, 1, 1]); // => 1
 */
function uniqueCountSorted(x) {
    var count = 0;
    var previous;
    // Because the input is sorted, a value differs from its predecessor
    // exactly when it starts a new run of equal values.
    for (var i = 0; i < x.length; i++) {
        if (i === 0 || x[i] !== previous) {
            previous = x[i];
            count++;
        }
    }
    return count;
}
|
1252 |
|
/**
 * Generates incrementally computed values based on the sums and sums of
 * squares for the data array: the within-segment sum of squared
 * deviations for the index range [j, i].
 *
 * @private
 * @param {number} j
 * @param {number} i
 * @param {Array<number>} sums
 * @param {Array<number>} sumsOfSquares
 * @return {number}
 * @example
 * ssq(0, 1, [-1, 0, 2], [1, 1, 5]);
 */
function ssq(j, i, sums, sumsOfSquares) {
    var result;
    if (j > 0) {
        // Segment starts mid-array: subtract the prefix up to j - 1.
        var segmentMean = (sums[i] - sums[j - 1]) / (i - j + 1);
        result =
            sumsOfSquares[i] -
            sumsOfSquares[j - 1] -
            (i - j + 1) * segmentMean * segmentMean;
    } else {
        // Segment starts at index 0: the whole prefix is the segment.
        result = sumsOfSquares[i] - (sums[i] * sums[i]) / (i + 1);
    }
    // Floating-point cancellation can produce a tiny negative value;
    // clamp it, since a sum of squares is never below zero.
    if (result < 0) {
        return 0;
    }
    return result;
}
|
1280 |
|
/**
 * Function that recursively divides and conquers computations
 * for cluster j
 *
 * Computes one row (`cluster`) of the dynamic-programming matrices:
 * it fills in the optimal cost and split point for the midpoint of
 * [iMin, iMax], then recurses into the two halves. The candidate range
 * for the split point `j` is narrowed using already-computed neighbors,
 * which is what brings the runtime down to O(kn log(n)).
 *
 * @private
 * @param {number} iMin Minimum index in cluster to be computed
 * @param {number} iMax Maximum index in cluster to be computed
 * @param {number} cluster Index of the cluster currently being computed
 * @param {Array<Array<number>>} matrix within-cluster sum-of-squares costs
 * @param {Array<Array<number>>} backtrackMatrix optimal split indexes
 * @param {Array<number>} sums prefix sums of the shifted data
 * @param {Array<number>} sumsOfSquares prefix sums of squared shifted data
 */
function fillMatrixColumn(
    iMin,
    iMax,
    cluster,
    matrix,
    backtrackMatrix,
    sums,
    sumsOfSquares
) {
    // Empty range: nothing left to compute in this branch.
    if (iMin > iMax) {
        return;
    }

    // Start at midpoint between iMin and iMax
    var i = Math.floor((iMin + iMax) / 2);

    // Seed with the cost of putting element i alone in this cluster
    // (previous row's cost for the first i - 1 elements).
    matrix[cluster][i] = matrix[cluster - 1][i - 1];
    backtrackMatrix[cluster][i] = i;

    var jlow = cluster; // the lower end for j

    // Tighten the lower bound using split points already computed for
    // neighboring columns (split points are monotone in i).
    // NOTE(review): the `|| 0` fallbacks guard against missing entries;
    // upstream Ckmeans indexes these matrices directly — confirm they are
    // intentional here.
    if (iMin > cluster) {
        jlow = Math.max(jlow, backtrackMatrix[cluster][iMin - 1] || 0);
    }
    jlow = Math.max(jlow, backtrackMatrix[cluster - 1][i] || 0);

    var jhigh = i - 1; // the upper end for j
    if (iMax < matrix[0].length - 1) {
        jhigh = Math.min(jhigh, backtrackMatrix[cluster][iMax + 1] || 0);
    }

    var sji;
    var sjlowi;
    var ssqjlow;
    var ssqj;
    // Scan candidate split points from high to low, keeping the best.
    for (var j = jhigh; j >= jlow; --j) {
        // Cost of making [j, i] the final cluster.
        sji = ssq(j, i, sums, sumsOfSquares);

        // Once even the cheapest possible continuation can't beat the
        // current best, no smaller j can either: stop early.
        if (sji + matrix[cluster - 1][jlow - 1] >= matrix[cluster][i]) {
            break;
        }

        // Examine the lower bound of the cluster border
        sjlowi = ssq(jlow, i, sums, sumsOfSquares);

        ssqjlow = sjlowi + matrix[cluster - 1][jlow - 1];

        if (ssqjlow < matrix[cluster][i]) {
            // Shrink the lower bound
            matrix[cluster][i] = ssqjlow;
            backtrackMatrix[cluster][i] = jlow;
        }
        jlow++;

        // Also test the current candidate j itself.
        ssqj = sji + matrix[cluster - 1][j - 1];
        if (ssqj < matrix[cluster][i]) {
            matrix[cluster][i] = ssqj;
            backtrackMatrix[cluster][i] = j;
        }
    }

    // Recurse into the halves on either side of the midpoint.
    fillMatrixColumn(
        iMin,
        i - 1,
        cluster,
        matrix,
        backtrackMatrix,
        sums,
        sumsOfSquares
    );
    fillMatrixColumn(
        i + 1,
        iMax,
        cluster,
        matrix,
        backtrackMatrix,
        sums,
        sumsOfSquares
    );
}
|
1374 |
|
/**
 * Initializes the main matrices used in Ckmeans and kicks
 * off the divide and conquer cluster computation strategy
 *
 * Builds prefix sums (and sums of squares) of the median-shifted data,
 * fills the first row of both matrices directly, then computes every
 * remaining row with `fillMatrixColumn`.
 *
 * @private
 * @param {Array<number>} data sorted array of values
 * @param {Array<Array<number>>} matrix within-cluster sum-of-squares costs
 * @param {Array<Array<number>>} backtrackMatrix optimal split indexes
 */
function fillMatrices(data, matrix, backtrackMatrix) {
    var nValues = matrix[0].length;

    // Shift values by the median to improve numeric stability
    var shift = data[Math.floor(nValues / 2)];

    // Cumulative sum and cumulative sum of squares for all values in data array
    var sums = [];
    var sumsOfSquares = [];

    // Initialize first column in matrix & backtrackMatrix
    for (var i = 0, shiftedValue = (void 0); i < nValues; ++i) {
        shiftedValue = data[i] - shift;
        if (i === 0) {
            // First element starts both running totals.
            sums.push(shiftedValue);
            sumsOfSquares.push(shiftedValue * shiftedValue);
        } else {
            // Extend the prefix totals from the previous index.
            sums.push(sums[i - 1] + shiftedValue);
            sumsOfSquares.push(
                sumsOfSquares[i - 1] + shiftedValue * shiftedValue
            );
        }

        // Initialize for cluster = 0: one cluster covering [0, i] has a
        // closed-form within-cluster sum of squares.
        matrix[0][i] = ssq(0, i, sums, sumsOfSquares);
        backtrackMatrix[0][i] = 0;
    }

    // Initialize the rest of the columns
    var iMin;
    for (var cluster = 1; cluster < matrix.length; ++cluster) {
        if (cluster < matrix.length - 1) {
            // A row needs at least `cluster` elements to its left, so
            // earlier columns can be skipped.
            iMin = cluster;
        } else {
            // No need to compute matrix[K-1][0] ... matrix[K-1][N-2]:
            // only the full-data column is read back later.
            iMin = nValues - 1;
        }

        fillMatrixColumn(
            iMin,
            nValues - 1,
            cluster,
            matrix,
            backtrackMatrix,
            sums,
            sumsOfSquares
        );
    }
}
|
1433 |
|
/**
 * Ckmeans clustering is an improvement on heuristic-based clustering
 * approaches like Jenks. The algorithm was developed in
 * [Haizhou Wang and Mingzhou Song](http://journal.r-project.org/archive/2011-2/RJournal_2011-2_Wang+Song.pdf)
 * as a [dynamic programming](https://en.wikipedia.org/wiki/Dynamic_programming) approach
 * to the problem of clustering numeric data into groups with the least
 * within-group sum-of-squared-deviations.
 *
 * Minimizing the difference within groups - what Wang & Song refer to as
 * `withinss`, or within sum-of-squares, means that groups are optimally
 * homogenous within and the data is split into representative groups.
 * This is very useful for visualization, where you may want to represent
 * a continuous variable in discrete color or style groups. This function
 * can provide groups that emphasize differences between data.
 *
 * Being a dynamic approach, this algorithm is based on two matrices that
 * store incrementally-computed values for squared deviations and backtracking
 * indexes.
 *
 * This implementation is based on Ckmeans 3.4.6, which introduced a new divide
 * and conquer approach that improved runtime from O(kn^2) to O(kn log(n)).
 *
 * Unlike the [original implementation](https://cran.r-project.org/web/packages/Ckmeans.1d.dp/index.html),
 * this implementation does not include any code to automatically determine
 * the optimal number of clusters: this information needs to be explicitly
 * provided.
 *
 * ### References
 * _Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic
 * Programming_ Haizhou Wang and Mingzhou Song ISSN 2073-4859
 *
 * from The R Journal Vol. 3/2, December 2011
 * @param {Array<number>} x input data, as an array of number values
 * @param {number} nClusters number of desired classes. This cannot be
 * greater than the number of values in the data array.
 * @returns {Array<Array<number>>} clustered input
 * @throws {Error} if the number of requested clusters is higher than the size of the data
 * @example
 * ckmeans([-1, 2, -1, 2, 4, 5, 6, -1, 2, -1], 3);
 * // The input, clustered into groups of similar numbers.
 * // => [[-1, -1, -1, -1], [2, 2, 2], [4, 5, 6]]
 */
function ckmeans(x, nClusters) {
    // Each cluster must hold at least one value.
    if (nClusters > x.length) {
        throw new Error(
            "cannot generate more classes than there are data values"
        );
    }

    var sorted = numericSort(x);
    // we'll use this as the maximum number of clusters
    var uniqueCount = uniqueCountSorted(sorted);

    // if all of the input values are identical, there's one cluster
    // with all of the input in it.
    if (uniqueCount === 1) {
        return [sorted];
    }

    // named 'S' originally
    var matrix = makeMatrix(nClusters, sorted.length);
    // named 'J' originally
    var backtrackMatrix = makeMatrix(nClusters, sorted.length);

    // This is a dynamic programming way to solve the problem of minimizing
    // within-cluster sum of squares. It's similar to linear regression
    // in this way, and this calculation incrementally computes the
    // sum of squares that are later read.
    fillMatrices(sorted, matrix, backtrackMatrix);

    // The real work of Ckmeans clustering happens in the matrix generation:
    // the generated matrices encode all possible clustering combinations, and
    // once they're generated we can solve for the best clustering groups
    // very quickly.
    var clusters = [];
    var clusterRight = backtrackMatrix[0].length - 1;

    // Backtrack the clusters from the dynamic programming matrix. This
    // starts at the bottom-right corner of the matrix (if the top-left is 0, 0),
    // and moves the cluster target with the loop.
    for (var cluster = backtrackMatrix.length - 1; cluster >= 0; cluster--) {
        // The backtrack matrix stores, for each (cluster, right edge),
        // the index where that cluster optimally begins.
        var clusterLeft = backtrackMatrix[cluster][clusterRight];

        // fill the cluster from the sorted input by taking a slice of the
        // array. the backtrack matrix makes this easy - it stores the
        // indexes where the cluster should start and end.
        clusters[cluster] = sorted.slice(clusterLeft, clusterRight + 1);

        // The next (earlier) cluster ends just before this one begins.
        if (cluster > 0) {
            clusterRight = clusterLeft - 1;
        }
    }

    return clusters;
}
|
1529 |
|
/**
 * Given an array of x, this will find the extent of the
 * x and return an array of breaks that can be used
 * to categorize the x into a number of classes. The
 * returned array will always be 1 longer than the number of
 * classes because it includes the minimum value.
 *
 * @param {Array<number>} x an array of number values
 * @param {number} nClasses number of desired classes
 * @returns {Array<number>} array of class break positions
 * @example
 * equalIntervalBreaks([1, 2, 3, 4, 5, 6], 4); // => [1, 2.25, 3.5, 4.75, 6]
 */
function equalIntervalBreaks(x, nClasses) {
    // Too few values to compute an extent: return the input as-is.
    if (x.length < 2) {
        return x;
    }

    var lowest = min(x);
    var highest = max(x);

    // The first break is always the minimum of the data.
    var breaks = [lowest];

    // Each class spans an equal share of the full data range.
    var stepSize = (highest - lowest) / nClasses;

    // Interior breaks; with nClasses = 1 this loop is skipped and the
    // result is just [min, max].
    for (var i = 1; i < nClasses; i++) {
        breaks.push(breaks[0] + stepSize * i);
    }

    // The last break is always the maximum.
    breaks.push(highest);

    return breaks;
}
|
1571 |
|
/**
 * [Sample covariance](https://en.wikipedia.org/wiki/Sample_mean_and_covariance) of two datasets:
 * how much do the two datasets move together?
 * x and y are two datasets, represented as arrays of numbers.
 *
 * @param {Array<number>} x a sample of two or more data points
 * @param {Array<number>} y a sample of two or more data points
 * @throws {Error} if x and y do not have equal lengths
 * @throws {Error} if x or y have length of one or less
 * @returns {number} sample covariance
 * @example
 * sampleCovariance([1, 2, 3, 4, 5, 6], [6, 5, 4, 3, 2, 1]); // => -3.5
 */
function sampleCovariance(x, y) {
    // The samples must be paired element-by-element...
    if (x.length !== y.length) {
        throw new Error("sampleCovariance requires samples with equal lengths");
    }

    // ...and covariance is undefined for fewer than two pairs.
    if (x.length < 2) {
        throw new Error(
            "sampleCovariance requires at least two data points in each sample"
        );
    }

    // Center each sample on its own mean so absolute magnitudes don't
    // matter - only co-movement relative to the mean does.
    var meanOfX = mean(x);
    var meanOfY = mean(y);

    // Accumulate the products of paired deviations: positive when the
    // pair deviates in the same direction, negative when opposite.
    var deviationProductSum = 0;
    for (var i = 0; i < x.length; i++) {
        deviationProductSum += (x[i] - meanOfX) * (y[i] - meanOfY);
    }

    // Divide by n - 1 (Bessel's correction) to account for the degree
    // of freedom consumed by estimating the means from the samples.
    var besselsCorrection = x.length - 1;

    return deviationProductSum / besselsCorrection;
}
|
1621 |
|
/**
 * The [sample variance](https://en.wikipedia.org/wiki/Variance#Sample_variance)
 * is the sum of squared deviations from the mean. The sample variance
 * is distinguished from the variance by the usage of [Bessel's Correction](https://en.wikipedia.org/wiki/Bessel's_correction):
 * instead of dividing the sum of squared deviations by the length of the input,
 * it is divided by the length minus one. This corrects the bias in estimating
 * a value from a set that you don't know if full.
 *
 * References:
 * * [Wolfram MathWorld on Sample Variance](http://mathworld.wolfram.com/SampleVariance.html)
 *
 * @param {Array<number>} x a sample of two or more data points
 * @throws {Error} if the length of x is less than 2
 * @return {number} sample variance
 * @example
 * sampleVariance([1, 2, 3, 4, 5]); // => 2.5
 */
function sampleVariance(x) {
    // Variance of a sample needs at least two observations.
    if (x.length < 2) {
        throw new Error("sampleVariance requires at least two data points");
    }

    var squaredDeviationSum = sumNthPowerDeviations(x, 2);

    // Divide by n - 1 (Bessel's correction) to compensate for the
    // degree of freedom lost by estimating the mean from the sample.
    var besselsCorrection = x.length - 1;

    return squaredDeviationSum / besselsCorrection;
}
|
1654 |
|
/**
 * The [sample standard deviation](http://en.wikipedia.org/wiki/Standard_deviation#Sample_standard_deviation)
 * is the square root of the sample variance.
 *
 * @param {Array<number>} x input array
 * @returns {number} sample standard deviation
 * @example
 * sampleStandardDeviation([2, 4, 4, 4, 5, 5, 7, 9]).toFixed(2);
 * // => '2.14'
 */
function sampleStandardDeviation(x) {
    // Defined directly in terms of the sample variance.
    return Math.sqrt(sampleVariance(x));
}
|
1669 |
|
/**
 * The [correlation](http://en.wikipedia.org/wiki/Correlation_and_dependence) is
 * a measure of how correlated two datasets are, between -1 and 1
 *
 * @param {Array<number>} x first input
 * @param {Array<number>} y second input
 * @returns {number} sample correlation
 * @example
 * sampleCorrelation([1, 2, 3, 4, 5, 6], [2, 2, 3, 4, 5, 60]).toFixed(2);
 * // => '0.69'
 */
function sampleCorrelation(x, y) {
    // Normalize the covariance by each sample's standard deviation,
    // dividing sequentially to preserve the original evaluation order.
    var correlation = sampleCovariance(x, y);
    correlation /= sampleStandardDeviation(x);
    correlation /= sampleStandardDeviation(y);

    return correlation;
}
|
1688 |
|
/**
 * [Skewness](http://en.wikipedia.org/wiki/Skewness) is
 * a measure of the extent to which a probability distribution of a
 * real-valued random variable "leans" to one side of the mean.
 * The skewness value can be positive or negative, or even undefined.
 *
 * Implementation is based on the adjusted Fisher-Pearson standardized
 * moment coefficient, which is the version found in Excel and several
 * statistical packages including Minitab, SAS and SPSS.
 *
 * @since 4.1.0
 * @param {Array<number>} x a sample of 3 or more data points
 * @returns {number} sample skewness
 * @throws {Error} if x has length less than 3
 * @example
 * sampleSkewness([2, 4, 6, 3, 1]); // => 0.590128656384365
 */
function sampleSkewness(x) {
    // Three points are the minimum for the adjusted coefficient.
    if (x.length < 3) {
        throw new Error("sampleSkewness requires at least three data points");
    }

    var meanValue = mean(x);
    var squaredDeviationSum = 0;
    var cubedDeviationSum = 0;

    // Accumulate the second and third central moments in one pass.
    for (var i = 0; i < x.length; i++) {
        var deviation = x[i] - meanValue;
        squaredDeviationSum += deviation * deviation;
        cubedDeviationSum += deviation * deviation * deviation;
    }

    // n - 1 is Bessel's correction: it compensates for the degree of
    // freedom lost by estimating the mean from the sample itself.
    var besselsCorrection = x.length - 1;

    // Sample standard deviation derived from the second moment.
    var standardDeviation = Math.sqrt(
        squaredDeviationSum / besselsCorrection
    );

    var n = x.length;
    var cubedS = Math.pow(standardDeviation, 3);

    // Adjusted Fisher-Pearson standardized moment coefficient.
    return (n * cubedDeviationSum) / ((n - 1) * (n - 2) * cubedS);
}
|
1737 |
|
/**
 * [Kurtosis](http://en.wikipedia.org/wiki/Kurtosis) measures the heaviness
 * of a distribution's tails relative to its variance. The value may be
 * positive, negative, or undefined.
 *
 * This implements Fisher's excess kurtosis with unbiased moment
 * estimators — the version found in Excel and in several statistical
 * packages, including SAS and SciPy.
 *
 * @param {Array<number>} x a sample of 4 or more data points
 * @returns {number} sample kurtosis
 * @throws {Error} if x has length less than 4
 * @example
 * sampleKurtosis([1, 2, 2, 3, 5]); // => 1.4555765595463122
 */
function sampleKurtosis(x) {
    var n = x.length;

    if (n < 4) {
        throw new Error("sampleKurtosis requires at least four data points");
    }

    var xBar = mean(x);
    var m2 = 0; // sum of squared deviations (second central moment, unnormalized)
    var m4 = 0; // sum of fourth-power deviations (fourth central moment, unnormalized)

    for (var i = 0; i < n; i++) {
        var d = x[i] - xBar;
        var d2 = d * d;
        m2 += d2;
        m4 += d2 * d * d;
    }

    // Unbiased estimator of excess kurtosis.
    return (
        ((n - 1) / ((n - 2) * (n - 3))) *
        ((n * (n + 1) * m4) / (m2 * m2) - 3 * (n - 1))
    );
}
|
1778 |
|
/**
 * Implementation of [Heap's Algorithm](https://en.wikipedia.org/wiki/Heap%27s_algorithm)
 * for generating permutations.
 *
 * The input array is not modified: the algorithm performs its swaps on a
 * private copy, so the caller's element order is preserved. (Previously
 * the swaps were done in place, leaving the caller's array in its final
 * permutation after the call.)
 *
 * @param {Array} elements any type of data
 * @returns {Array<Array>} array of permutations
 */
function permutationsHeap(elements) {
    // Work on a copy so the caller's array is left untouched.
    var working = elements.slice();
    var indexes = new Array(working.length);
    var permutations = [working.slice()];

    for (var i = 0; i < working.length; i++) {
        indexes[i] = 0;
    }

    for (var i$1 = 0; i$1 < working.length; ) {
        if (indexes[i$1] < i$1) {
            // At odd indexes, swap from indexes[i] instead
            // of from the beginning of the array
            var swapFrom = 0;
            if (i$1 % 2 !== 0) {
                swapFrom = indexes[i$1];
            }

            // swap between swapFrom and i, using
            // a temporary variable as storage.
            var temp = working[swapFrom];
            working[swapFrom] = working[i$1];
            working[i$1] = temp;

            // Record a snapshot of this permutation.
            permutations.push(working.slice());
            indexes[i$1]++;
            i$1 = 0;
        } else {
            indexes[i$1] = 0;
            i$1++;
        }
    }

    return permutations;
}
|
1820 |
|
/**
 * Implementation of [Combinations](https://en.wikipedia.org/wiki/Combination).
 * Combinations are unique subsets of a collection — in this case, k
 * elements of x taken at a time, without replacement.
 *
 * @param {Array} x any type of data
 * @param {int} k the number of objects in each group (without replacement)
 * @returns {Array<Array>} array of combinations
 * @example
 * combinations([1, 2, 3], 2); // => [[1,2], [1,3], [2,3]]
 */
function combinations(x, k) {
    var results = [];

    // Each element in turn serves as the leading member of a combination.
    for (var leadIndex = 0; leadIndex < x.length; leadIndex++) {
        if (k === 1) {
            // Base case: a 1-combination is just the element itself.
            results.push([x[leadIndex]]);
        } else {
            // Recurse over the elements AFTER the lead (no replacement)
            // to pick the remaining k - 1 members, then prepend the lead.
            var tails = combinations(x.slice(leadIndex + 1, x.length), k - 1);
            for (var t = 0; t < tails.length; t++) {
                results.push([x[leadIndex]].concat(tails[t]));
            }
        }
    }
    return results;
}
|
1853 |
|
/**
 * Implementation of [Combinations](https://en.wikipedia.org/wiki/Combination) with replacement.
 * Combinations are unique subsets of a collection - in this case, k x from a collection at a time.
 * 'With replacement' means that a given element can be chosen multiple times.
 * Unlike permutation, order doesn't matter for combinations.
 *
 * @param {Array} x any type of data
 * @param {int} k the number of objects in each group (with replacement)
 * @returns {Array<Array>} array of combinations
 * @example
 * combinationsReplacement([1, 2], 2); // => [[1, 1], [1, 2], [2, 2]]
 */
function combinationsReplacement(x, k) {
    var combinationList = [];

    for (var i = 0; i < x.length; i++) {
        if (k === 1) {
            // If we're requested to find only one element, we don't need
            // to recurse: just push `x[i]` onto the list of combinations.
            combinationList.push([x[i]]);
        } else {
            // Otherwise, recursively find combinations, given `k - 1`. Note
            // that the slice starts AT `i` (not `i + 1`), which is what
            // allows the same element to be chosen again — "replacement".
            // The -1 gets reversed in the loop below, since we concatenate
            // `x[i]` onto each selected combination, bringing `k` back up
            // to the requested level. Recursion stops once k = 1.
            var subsetCombinations = combinationsReplacement(
                x.slice(i, x.length),
                k - 1
            );

            for (var j = 0; j < subsetCombinations.length; j++) {
                combinationList.push([x[i]].concat(subsetCombinations[j]));
            }
        }
    }

    return combinationList;
}
|
1895 |
|
/**
 * When adding a new value to a list, the mean does not have to be
 * recomputed from scratch in linear time. Given the current mean, the
 * count of elements that produced it, and the value being added, this
 * returns the updated mean in constant time.
 *
 * @since 2.5.0
 * @param {number} mean current mean
 * @param {number} n number of items in the list
 * @param {number} newValue the added value
 * @returns {number} the new mean
 *
 * @example
 * addToMean(14, 5, 53); // => 20.5
 */
function addToMean(mean, n, newValue) {
    // Incremental update: shift the old mean toward the new value by the
    // new value's deviation divided by the new element count.
    var delta = newValue - mean;
    return mean + delta / (n + 1);
}
|
1915 |
|
/**
 * When combining two lists whose means are already known, the combined
 * mean does not have to be recomputed from the raw values in linear time.
 * Given each list's mean and element count, this returns the mean of the
 * concatenation in constant time.
 *
 * @since 3.0.0
 * @param {number} mean1 mean of the first list
 * @param {number} n1 number of items in the first list
 * @param {number} mean2 mean of the second list
 * @param {number} n2 number of items in the second list
 * @returns {number} the combined mean
 *
 * @example
 * combineMeans(5, 3, 4, 3); // => 4.5
 */
function combineMeans(mean1, n1, mean2, n2) {
    // Reconstruct each list's total from mean * count, then divide the
    // grand total by the combined count.
    var weightedTotal = mean1 * n1 + mean2 * n2;
    return weightedTotal / (n1 + n2);
}
|
1936 |
|
/**
 * When combining two lists whose variances are already known, the combined
 * variance does not have to be recomputed from the raw values in linear
 * time. Given each list's variance, mean and element count, this returns
 * the variance of the concatenation in constant time.
 *
 * @since 3.0.0
 * @param {number} variance1 variance of the first list
 * @param {number} mean1 mean of the first list
 * @param {number} n1 number of items in the first list
 * @param {number} variance2 variance of the second list
 * @param {number} mean2 mean of the second list
 * @param {number} n2 number of items in the second list
 * @returns {number} the combined variance
 *
 * @example
 * combineVariances(14 / 3, 5, 3, 8 / 3, 4, 3); // => 47 / 12
 */
function combineVariances(variance1, mean1, n1, variance2, mean2, n2) {
    // Pooled mean of the two lists, weighted by their sizes
    // (same computation as combineMeans, inlined here).
    var pooledMean = (mean1 * n1 + mean2 * n2) / (n1 + n2);

    // Each variance is shifted by the squared distance between its own
    // mean and the pooled mean, then the two are size-weighted.
    var shifted1 = variance1 + Math.pow(mean1 - pooledMean, 2);
    var shifted2 = variance2 + Math.pow(mean2 - pooledMean, 2);
    return (n1 * shifted1 + n2 * shifted2) / (n1 + n2);
}
|
1965 |
|
/**
 * The [Geometric Mean](https://en.wikipedia.org/wiki/Geometric_mean) is
 * a mean function that is more useful for numbers in different ranges:
 * it is the nth root of the product of the input numbers.
 *
 * The geometric mean is often useful for
 * **[proportional growth](https://en.wikipedia.org/wiki/Geometric_mean#Proportional_growth)**:
 * given growth rates for multiple years, like _80%, 16.66% and 42.85%_,
 * a simple mean will incorrectly estimate an average growth rate, whereas
 * the geometric mean correctly estimates a single rate that, applied over
 * those years, yields the same end value.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @returns {number} geometric mean
 * @throws {Error} if x is empty
 * @throws {Error} if x contains a number that is not positive
 * @example
 * var growthRates = [1.80, 1.166666, 1.428571];
 * var averageGrowth = ss.geometricMean(growthRates);
 * var averageGrowthRates = [averageGrowth, averageGrowth, averageGrowth];
 * var startingValue = 10;
 * var startingValueMean = 10;
 * growthRates.forEach(function(rate) {
 *   startingValue *= rate;
 * });
 * averageGrowthRates.forEach(function(rate) {
 *   startingValueMean *= rate;
 * });
 * startingValueMean === startingValue;
 */
function geometricMean(x) {
    var n = x.length;
    if (n === 0) {
        throw new Error("geometricMean requires at least one data point");
    }

    // Running product of every input value.
    var product = 1;

    for (var i = 0; i < n; i++) {
        // The geometric mean is only defined for positive numbers.
        if (x[i] <= 0) {
            throw new Error(
                "geometricMean requires only positive numbers as input"
            );
        }

        product *= x[i];
    }

    // The nth root of the product.
    return Math.pow(product, 1 / n);
}
|
2022 |
|
/**
 * The [Harmonic Mean](https://en.wikipedia.org/wiki/Harmonic_mean) is
 * a mean function typically used to find the average of rates: it is the
 * reciprocal of the arithmetic mean of the reciprocals of the input.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @returns {number} harmonic mean
 * @throws {Error} if x is empty
 * @throws {Error} if x contains a number that is not positive
 * @example
 * harmonicMean([2, 3]).toFixed(2) // => '2.40'
 */
function harmonicMean(x) {
    var n = x.length;
    if (n === 0) {
        throw new Error("harmonicMean requires at least one data point");
    }

    // Sum of the reciprocals of every input value.
    var sumOfReciprocals = 0;

    for (var i = 0; i < n; i++) {
        // The harmonic mean is only defined for positive numbers (a zero
        // would divide by zero; negatives are mathematically excluded).
        if (x[i] <= 0) {
            throw new Error(
                "harmonicMean requires only positive numbers as input"
            );
        }

        sumOfReciprocals += 1 / x[i];
    }

    // n divided by the reciprocal sum.
    return n / sumOfReciprocals;
}
|
2062 |
|
/**
 * The mean, _also known as average_,
 * is the sum of all values over the number of values.
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * The simple mean uses the successive addition method internally
 * to calculate its result. Errors in floating-point addition are
 * not accounted for, so if precision is required, the standard {@link mean}
 * method should be used instead.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @throws {Error} if the length of x is less than one
 * @returns {number} mean
 * @example
 * meanSimple([0, 10]); // => 5
 */
function meanSimple(x) {
    if (x.length === 0) {
        throw new Error("meanSimple requires at least one data point");
    }

    return sumSimple(x) / x.length;
}
|
2090 |
|
/**
 * The [median](http://en.wikipedia.org/wiki/Median) is
 * the middle number of a list. This is often a good indicator of 'the middle'
 * when there are outliers that skew the `mean()` value.
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * The median isn't necessarily one of the elements in the list: the value
 * can be the average of two elements if the list has an even length
 * and the two central values are different.
 *
 * The input **must already be sorted in ascending order**; this variant
 * skips the sort that `median()` would perform, so passing unsorted data
 * returns a meaningless result.
 *
 * @param {Array<number>} sorted input, sorted in ascending order
 * @returns {number} median value
 * @example
 * medianSorted([1, 2, 2, 5, 10, 100]); // => 3.5
 */
function medianSorted(sorted) {
    // The median is simply the 0.5 quantile of a sorted list.
    return quantileSorted(sorted, 0.5);
}
|
2110 |
|
/**
 * When removing a value from a list, the mean does not have to be
 * recomputed from scratch in linear time. Given the current mean, the
 * count of elements that produced it, and the value being removed, this
 * returns the updated mean in constant time.
 *
 * @since 3.0.0
 * @param {number} mean current mean
 * @param {number} n number of items in the list
 * @param {number} value the value to remove
 * @returns {number} the new mean
 *
 * @example
 * subtractFromMean(20.5, 6, 53); // => 14
 */
function subtractFromMean(mean, n, value) {
    // Reconstruct the total from mean * count, drop the removed value,
    // then divide by the reduced count.
    var reducedTotal = mean * n - value;
    return reducedTotal / (n - 1);
}
|
2129 |
|
/**
 * The Root Mean Square (RMS) is a mean function used as a measure of the
 * magnitude of a set of numbers, regardless of their sign: the square
 * root of the mean of the squared input numbers.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x a sample of one or more data points
 * @returns {number} root mean square
 * @throws {Error} if x is empty
 * @example
 * rootMeanSquare([-1, 1, -1, 1]); // => 1
 */
function rootMeanSquare(x) {
    if (x.length === 0) {
        throw new Error("rootMeanSquare requires at least one data point");
    }

    // Accumulate the sum of squares, then take the root of its mean.
    var sumOfSquares = x.reduce(function (accumulator, value) {
        return accumulator + Math.pow(value, 2);
    }, 0);

    return Math.sqrt(sumOfSquares / x.length);
}
|
2156 |
|
/**
 * This computes a [one-sample t-test](https://en.wikipedia.org/wiki/Student%27s_t-test#One-sample_t-test),
 * comparing the mean of a sample to a known value.
 *
 * Here we ask whether the population mean equals the known value
 * `expectedValue`. The returned t-statistic is usually looked up against a
 * [p-value](http://en.wikipedia.org/wiki/P-value) table: for a chosen
 * significance level, that determines whether the null hypothesis can be
 * rejected.
 *
 * @param {Array<number>} x sample of one or more numbers
 * @param {number} expectedValue expected value of the population mean
 * @returns {number} value
 * @example
 * tTest([1, 2, 3, 4, 5, 6], 3.385).toFixed(2); // => '0.16'
 */
function tTest(x, expectedValue) {
    // t = (x̄ - μ) / (s / √n)
    var sampleMean = mean(x);
    var standardError = standardDeviation(x) / Math.sqrt(x.length);
    return (sampleMean - expectedValue) / standardError;
}
|
2187 |
|
/**
 * This computes a [two sample t-test](http://en.wikipedia.org/wiki/Student's_t-test).
 * It tests whether "mean(X)-mean(Y) = difference" — most commonly with
 * `difference == 0`, asking whether two samples were likely drawn from
 * populations with the same mean — assuming nothing about the standard
 * deviations of the samples except that they are equal.
 *
 * The returned t-statistic is usually looked up against a
 * [p-value](http://en.wikipedia.org/wiki/P-value) table: for a chosen
 * significance level, that determines whether the null hypothesis can be
 * rejected.
 *
 * `difference` can be omitted if it equals 0.
 *
 * [This is used to confirm or deny](http://www.monarchlab.org/Lab/Research/Stats/2SampleT.aspx)
 * a null hypothesis that the two populations sampled into `sampleX` and
 * `sampleY` are equal to each other.
 *
 * @param {Array<number>} sampleX a sample as an array of numbers
 * @param {Array<number>} sampleY a sample as an array of numbers
 * @param {number} [difference=0]
 * @returns {number|null} test result
 *
 * @example
 * tTestTwoSample([1, 2, 3, 4], [3, 4, 5, 6], 0); // => -2.1908902300206643
 */
function tTestTwoSample(sampleX, sampleY, difference) {
    var n = sampleX.length;
    var m = sampleY.length;

    // Without data on both sides there is nothing to compare.
    if (!n || !m) {
        return null;
    }

    // The hypothesized difference in means (mu) defaults to zero.
    if (!difference) {
        difference = 0;
    }

    var xMean = mean(sampleX);
    var yMean = mean(sampleY);
    var xVariance = sampleVariance(sampleX);
    var yVariance = sampleVariance(sampleY);

    if (
        typeof xMean === "number" &&
        typeof yMean === "number" &&
        typeof xVariance === "number" &&
        typeof yVariance === "number"
    ) {
        // Pooled variance, weighting each sample by its degrees of
        // freedom — this encodes the equal-standard-deviation assumption.
        var pooledVariance =
            ((n - 1) * xVariance + (m - 1) * yVariance) / (n + m - 2);

        return (
            (xMean - yMean - difference) /
            Math.sqrt(pooledVariance * (1 / n + 1 / m))
        );
    }
}
|
2251 |
|
/**
 * [Bayesian Classifier](http://en.wikipedia.org/wiki/Naive_Bayes_classifier)
 *
 * This is a naïve bayesian classifier that takes
 * singly-nested objects.
 *
 * @class
 * @example
 * var bayes = new BayesianClassifier();
 * bayes.train({
 *   species: 'Cat'
 * }, 'animal');
 * var result = bayes.score({
 *   species: 'Cat'
 * })
 * // result
 * // {
 * //   animal: 1
 * // }
 */
var BayesianClassifier = function BayesianClassifier() {
    // Total number of training items seen so far; `score` divides
    // per-combination counts by this to turn them into frequencies.
    this.totalCount = 0;
    // Nested training counts: data[category][key][value] is the number
    // of items trained under `category` with item[key] === value.
    this.data = {};
};
|
2279 |
|
/**
 * Train the classifier with a new item, which has a single
 * dimension of Javascript literal keys and values.
 *
 * @param {Object} item an object with singly-deep properties
 * @param {string} category the category this item belongs to
 * @return {undefined} adds the item to the classifier
 */
BayesianClassifier.prototype.train = function train (item, category) {
    // Lazily create the per-category table of counts the first time
    // this category is seen.
    var categoryData = this.data[category];
    if (!categoryData) {
        categoryData = this.data[category] = {};
    }

    // Count each of the item's key/value pairs under the category,
    // creating the nested tables `categoryData[key][value]` on demand.
    for (var key in item) {
        var value = item[key];
        if (categoryData[key] === undefined) {
            categoryData[key] = {};
        }
        if (categoryData[key][value] === undefined) {
            categoryData[key][value] = 0;
        }

        // Record one more sighting of this key/value combination.
        categoryData[key][value]++;
    }

    // One more item has been classified overall.
    this.totalCount++;
};
|
2314 |
|
/**
 * Generate a score of how well this item matches all
 * possible categories based on its attributes
 *
 * @param {Object} item an item in the same format as with train
 * @returns {Object} of probabilities that this item belongs to a
 * given category.
 */
BayesianClassifier.prototype.score = function score (item) {
    // Odds per category, keyed by "key_value" combination.
    var odds = {};
    var category;
    // Iterate through each key in the item,
    // then iterate through each category that has been used
    // in previous calls to `.train()`
    for (var k in item) {
        var v = item[k];
        for (category in this.data) {
            // Create the combination table for this category only once,
            // so combinations recorded for earlier keys are preserved.
            // (Previously it was reset on every key, which discarded all
            // but the item's last property from the score.)
            if (odds[category] === undefined) {
                odds[category] = {};
            }

            // If this item doesn't even have a property, it counts for nothing,
            // but if it does have the property that we're looking for from
            // the item to categorize, it counts based on how popular it is
            // versus the whole population.
            if (this.data[category][k]) {
                odds[category][k + "_" + v] =
                    (this.data[category][k][v] || 0) / this.totalCount;
            } else {
                odds[category][k + "_" + v] = 0;
            }
        }
    }

    // Set up a new object that will contain sums of these odds by category
    var oddsSums = {};

    for (category in odds) {
        // Tally all of the odds for each category-combination pair -
        // the non-existence of a category does not add anything to the
        // score.
        oddsSums[category] = 0;
        for (var combination in odds[category]) {
            oddsSums[category] += odds[category][combination];
        }
    }

    return oddsSums;
};
|
2365 |
|
/**
 * This is a single-layer [Perceptron Classifier](http://en.wikipedia.org/wiki/Perceptron) that takes
 * arrays of numbers and predicts whether they should be classified
 * as either 0 or 1 (negative or positive examples).
 * @class
 * @example
 * // Create the model
 * var p = new PerceptronModel();
 * // Train the model with input with a diagonal boundary.
 * for (var i = 0; i < 5; i++) {
 *     p.train([1, 1], 1);
 *     p.train([0, 1], 0);
 *     p.train([1, 0], 0);
 *     p.train([0, 0], 0);
 * }
 * p.predict([0, 0]); // 0
 * p.predict([0, 1]); // 0
 * p.predict([1, 0]); // 0
 * p.predict([1, 1]); // 1
 */
var PerceptronModel = function PerceptronModel() {
    // The weights, or coefficients of the model;
    // weights are only populated when training with data.
    this.weights = [];
    // The bias term, or intercept; it is also a weight but
    // it's stored separately for convenience as it is always
    // multiplied by one.
    this.bias = 0;
};
|
/**
 * **Predict**: Use an array of features with the weight array and bias
 * to predict whether an example is labeled 0 or 1.
 *
 * @param {Array<number>} features an array of features as numbers
 * @returns {number} 1 if the score is over 0, otherwise 0
 */
PerceptronModel.prototype.predict = function predict (features) {
    // A prediction is only meaningful when the model was trained on
    // feature arrays of the same length.
    if (features.length !== this.weights.length) {
        return null;
    }

    // Dot product of weights and features...
    var total = 0;
    for (var i = 0; i < this.weights.length; i++) {
        total += this.weights[i] * features[i];
    }
    // ...plus the bias term (implicitly multiplied by one).
    total += this.bias;

    // Threshold at zero: positive scores classify as 1, the rest as 0.
    return total > 0 ? 1 : 0;
};
|
2424 |
|
/**
 * **Train** the classifier with a new example, which is
 * a numeric array of features and a 0 or 1 label.
 *
 * @param {Array<number>} features an array of features as numbers
 * @param {number} label either 0 or 1
 * @returns {PerceptronModel} this
 */
PerceptronModel.prototype.train = function train (features, label) {
    // Require that only labels of 0 or 1 are considered.
    if (label !== 0 && label !== 1) {
        return null;
    }
    // The length of the feature array determines
    // the length of the weight array.
    // The perceptron will continue learning as long as
    // it keeps seeing feature arrays of the same length.
    // When it sees a new data shape, it initializes.
    if (features.length !== this.weights.length) {
        // Copy the features so later weight updates cannot mutate the
        // caller's array. (Previously the array was aliased directly,
        // so training corrupted the caller's example in place.)
        this.weights = features.slice();
        this.bias = 1;
    }
    // Make a prediction based on current weights.
    var prediction = this.predict(features);
    // Update the weights if the prediction is wrong.
    if (typeof prediction === "number" && prediction !== label) {
        var gradient = label - prediction;
        for (var i = 0; i < this.weights.length; i++) {
            this.weights[i] += gradient * features[i];
        }
        this.bias += gradient;
    }
    return this;
};
|
2459 |
|
/**
 * We use `ε`, epsilon, as a stopping criterion when we want to iterate
 * until we're "close enough". Epsilon is a very small number: for
 * simple statistics, that number is **0.0001**
 *
 * This is used in calculations like the binomialDistribution, in which
 * the process of finding a value is [iterative](https://en.wikipedia.org/wiki/Iterative_method):
 * it progresses until it is close enough.
 *
 * Below is an example of using epsilon in [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent),
 * where we're trying to find a local minimum of a function's derivative,
 * given by the `fDerivative` method.
 *
 * @example
 * // From calculation, we expect that the local minimum occurs at x=9/4
 * var x_old = 0;
 * // The algorithm starts at x=6
 * var x_new = 6;
 * var stepSize = 0.01;
 *
 * function fDerivative(x) {
 *   return 4 * Math.pow(x, 3) - 9 * Math.pow(x, 2);
 * }
 *
 * // The loop runs until the difference between the previous
 * // value and the current value is smaller than epsilon - a rough
 * // measure of 'close enough'
 * while (Math.abs(x_new - x_old) > ss.epsilon) {
 *   x_old = x_new;
 *   x_new = x_old - stepSize * fDerivative(x_old);
 * }
 *
 * console.log('Local minimum occurs at', x_new);
 */
var epsilon = 0.0001;
|
2495 |
|
/**
 * A [Factorial](https://en.wikipedia.org/wiki/Factorial), usually written n!, is the product of all positive
 * integers less than or equal to n. Often factorial is implemented
 * recursively, but this iterative approach is significantly faster
 * and simpler.
 *
 * @param {number} n input, must be a non-negative integer (0! is defined as 1)
 * @returns {number} factorial: n!
 * @throws {Error} if n is less than 0 or not an integer
 * @example
 * factorial(5); // => 120
 */
function factorial(n) {
    // factorial is mathematically undefined for negative numbers
    if (n < 0) {
        throw new Error("factorial requires a non-negative value");
    }

    // reject fractional input: factorial is only defined on integers
    if (Math.floor(n) !== n) {
        throw new Error("factorial requires an integer input");
    }

    // typically you'll expand the factorial function going down, like
    // 5! = 5 * 4 * 3 * 2 * 1. This goes in the opposite direction,
    // counting from 2 up to the number in question; since anything
    // multiplied by 1 is itself, the loop only needs to start at 2.
    // For n of 0 or 1 the loop never runs and the result is 1.
    var accumulator = 1;
    for (var i = 2; i <= n; i++) {
        // for each number up to and including the number `n`, multiply
        // the accumulator by that number.
        accumulator *= i;
    }
    return accumulator;
}
|
2530 |
|
/**
 * Compute the [gamma function](https://en.wikipedia.org/wiki/Gamma_function) of a value using Nemes' approximation.
 * The gamma of n is equivalent to (n-1)!, but unlike the factorial function, gamma is defined for all real n except zero
 * and negative integers (where NaN is returned). Note, the gamma function is also well-defined for complex numbers,
 * though this implementation currently does not handle complex numbers as input values.
 * Nemes' approximation is defined [here](https://arxiv.org/abs/1003.6020) as Theorem 2.2.
 * Negative values use [Euler's reflection formula](https://en.wikipedia.org/wiki/Gamma_function#Properties) for computation.
 *
 * @param {number} n Any real number except for zero and negative integers.
 * @returns {number} The gamma of the input value.
 *
 * @example
 * gamma(11.5); // 11899423.084037038
 * gamma(-11.5); // 2.29575810481609e-8
 * gamma(5); // 24
 */
function gamma(n) {
    if (Number.isInteger(n)) {
        if (n <= 0) {
            // gamma not defined for zero or negative integers
            return NaN;
        } else {
            // positive integers take the exact path: gamma(n) = (n - 1)!
            return factorial(n - 1);
        }
    }

    // Decrement n, because the approximation below is defined for n - 1
    n--;

    if (n < 0) {
        // Use Euler's reflection formula for negative inputs
        // (recurses with a positive argument, handled by the else branch)
        // see: https://en.wikipedia.org/wiki/Gamma_function#Properties
        return Math.PI / (Math.sin(Math.PI * -n) * gamma(-n));
    } else {
        // Nemes' expansion approximation: leading Stirling-like factor...
        var seriesCoefficient =
            Math.pow(n / Math.E, n) * Math.sqrt(2 * Math.PI * (n + 1 / 6));

        var seriesDenom = n + 1 / 4;

        // ...times a truncated correction series in inverse powers of
        // (n + 1/4); the constants are the coefficients from Theorem 2.2.
        var seriesExpansion =
            1 +
            1 / 144 / Math.pow(seriesDenom, 2) -
            1 / 12960 / Math.pow(seriesDenom, 3) -
            257 / 207360 / Math.pow(seriesDenom, 4) -
            52 / 2612736 / Math.pow(seriesDenom, 5) +
            5741173 / 9405849600 / Math.pow(seriesDenom, 6) +
            37529 / 18811699200 / Math.pow(seriesDenom, 7);

        return seriesCoefficient * seriesExpansion;
    }
}
|
2584 |
|
// Define series coefficients
// These are the 15 Lanczos-approximation series coefficients (for the
// parameter g = 607/128 defined below); they are consumed by the
// `gammaln` partial-fraction sum.
var COEFFICIENTS = [
    0.99999999999999709182,
    57.156235665862923517,
    -59.597960355475491248,
    14.136097974741747174,
    -0.49191381609762019978,
    0.33994649984811888699e-4,
    0.46523628927048575665e-4,
    -0.98374475304879564677e-4,
    0.15808870322491248884e-3,
    -0.21026444172410488319e-3,
    0.2174396181152126432e-3,
    -0.16431810653676389022e-3,
    0.84418223983852743293e-4,
    -0.2619083840158140867e-4,
    0.36899182659531622704e-5
];

// Lanczos parameter g, paired with the coefficient set above; appears in
// the `g + 0.5 + n` term of `gammaln`.
var g = 607 / 128;
// Precomputed log(sqrt(2 * pi)): the constant additive term of the
// Lanczos log-gamma formula.
var LOGSQRT2PI = Math.log(Math.sqrt(2 * Math.PI));
|
2606 |
|
2607 | /**
|
2608 | * Compute the logarithm of the [gamma function](https://en.wikipedia.org/wiki/Gamma_function) of a value using Lanczos' approximation.
|
2609 | * This function takes as input any real-value n greater than 0.
|
2610 | * This function is useful for values of n too large for the normal gamma function (n > 165).
|
2611 | * The code is based on Lanczo's Gamma approximation, defined [here](http://my.fit.edu/~gabdo/gamma.txt).
|
2612 | *
|
2613 | * @param {number} n Any real number greater than zero.
|
2614 | * @returns {number} The logarithm of gamma of the input value.
|
2615 | *
|
2616 | * @example
|
2617 | * gammaln(500); // 2605.1158503617335
|
2618 | * gammaln(2.4); // 0.21685932244884043
|
2619 | */
|
2620 | function gammaln(n) {
|
2621 | // Return infinity if value not in domain
|
2622 | if (n <= 0) {
|
2623 | return Infinity;
|
2624 | }
|
2625 |
|
2626 | // Decrement n, because approximation is defined for n - 1
|
2627 | n--;
|
2628 |
|
2629 | // Create series approximation
|
2630 | var a = COEFFICIENTS[0];
|
2631 |
|
2632 | for (var i = 1; i < 15; i++) {
|
2633 | a += COEFFICIENTS[i] / (n + i);
|
2634 | }
|
2635 |
|
2636 | var tmp = g + 0.5 + n;
|
2637 |
|
2638 | // Return natural logarithm of gamma(n)
|
2639 | return LOGSQRT2PI + Math.log(a) - tmp + (n + 0.5) * Math.log(tmp);
|
2640 | }
|
2641 |
|
2642 | /**
|
2643 | * The [Bernoulli distribution](http://en.wikipedia.org/wiki/Bernoulli_distribution)
|
2644 | * is the probability discrete
|
2645 | * distribution of a random variable which takes value 1 with success
|
2646 | * probability `p` and value 0 with failure
|
2647 | * probability `q` = 1 - `p`. It can be used, for example, to represent the
|
2648 | * toss of a coin, where "1" is defined to mean "heads" and "0" is defined
|
2649 | * to mean "tails" (or vice versa). It is
|
2650 | * a special case of a Binomial Distribution
|
2651 | * where `n` = 1.
|
2652 | *
|
2653 | * @param {number} p input value, between 0 and 1 inclusive
|
2654 | * @returns {number[]} values of bernoulli distribution at this point
|
2655 | * @throws {Error} if p is outside 0 and 1
|
2656 | * @example
|
2657 | * bernoulliDistribution(0.3); // => [0.7, 0.3]
|
2658 | */
|
function bernoulliDistribution(p) /*: number[] */ {
    // A probability must lie in the closed interval [0, 1].
    if (p < 0 || p > 1) {
        throw new Error(
            "bernoulliDistribution requires probability to be between 0 and 1 inclusive"
        );
    }

    // Index 0 holds P(X = 0) = 1 - p; index 1 holds P(X = 1) = p.
    var failureProbability = 1 - p;
    return [failureProbability, p];
}
|
2669 |
|
2670 | /**
|
2671 | * The [Binomial Distribution](http://en.wikipedia.org/wiki/Binomial_distribution) is the discrete probability
|
2672 | * distribution of the number of successes in a sequence of n independent yes/no experiments, each of which yields
|
2673 | * success with probability `probability`. Such a success/failure experiment is also called a Bernoulli experiment or
|
2674 | * Bernoulli trial; when trials = 1, the Binomial Distribution is a Bernoulli Distribution.
|
2675 | *
|
2676 | * @param {number} trials number of trials to simulate
|
 * @param {number} probability chance of success on each trial, between 0 and 1 inclusive
|
2678 | * @returns {number[]} output
|
2679 | */
|
function binomialDistribution(trials, probability) /*: ?number[] */ {
    // Bail out (returning undefined) unless `probability` is a valid
    // probability and `trials` is a strictly positive integer.
    if (probability < 0 || probability > 1 || trials <= 0 || trials % 1 !== 0) {
        return undefined;
    }

    // `successes` is the random variable; `accumulated` tracks how much of
    // the total probability mass has been emitted so far; `pmf` is the
    // returned array of probability masses; `coefficient` carries the
    // binomial coefficient C(trials, successes), updated incrementally.
    var successes = 0;
    var accumulated = 0;
    var pmf = [];
    var coefficient = 1;

    // Emit one probability mass per outcome until nearly all of the
    // distribution (to within `epsilon`) has been accounted for.
    do {
        // The [probability mass function](https://en.wikipedia.org/wiki/Probability_mass_function)
        // of the binomial distribution at `successes`.
        pmf[successes] =
            coefficient *
            Math.pow(probability, successes) *
            Math.pow(1 - probability, trials - successes);
        accumulated += pmf[successes];
        successes++;
        // Advance the coefficient with the multiplicative recurrence
        // C(n, k) = C(n, k - 1) * (n - k + 1) / k.
        coefficient = (coefficient * (trials - successes + 1)) / successes;
        // Once the accumulated mass is nearly 1 we have covered the
        // useful range of this distribution.
    } while (accumulated < 1 - epsilon);

    return pmf;
}
|
2716 |
|
2717 | /**
|
2718 | * The [Poisson Distribution](http://en.wikipedia.org/wiki/Poisson_distribution)
|
2719 | * is a discrete probability distribution that expresses the probability
|
2720 | * of a given number of events occurring in a fixed interval of time
|
2721 | * and/or space if these events occur with a known average rate and
|
2722 | * independently of the time since the last event.
|
2723 | *
|
2724 | * The Poisson Distribution is characterized by the strictly positive
|
2725 | * mean arrival or occurrence rate, `λ`.
|
2726 | *
|
2727 | * @param {number} lambda location poisson distribution
|
2728 | * @returns {number[]} values of poisson distribution at that point
|
2729 | */
|
function poissonDistribution(lambda) /*: ?number[] */ {
    // The rate parameter must be strictly positive.
    if (lambda <= 0) {
        return undefined;
    }

    // `count` is the current event count; `accumulated` tracks the
    // probability mass emitted so far (so we know when to stop); `pmf`
    // holds the returned masses; `countFactorial` is a running count!.
    var count = 0;
    var accumulated = 0;
    var pmf = [];
    var countFactorial = 1;

    // Emit one probability mass per event count until nearly all of the
    // distribution (to within `epsilon`) has been covered.
    do {
        // The [probability mass function](https://en.wikipedia.org/wiki/Probability_mass_function):
        // P(X = k) = e^-lambda * lambda^k / k!
        pmf[count] = (Math.exp(-lambda) * Math.pow(lambda, count)) / countFactorial;
        accumulated += pmf[count];
        count++;
        countFactorial *= count;
        // Once the accumulated mass is nearly 1 we have covered the
        // useful range of this distribution.
    } while (accumulated < 1 - epsilon);

    return pmf;
}
|
2760 |
|
2761 | /**
|
2762 | * **Percentage Points of the χ2 (Chi-Squared) Distribution**
|
2763 | *
|
2764 | * The [χ2 (Chi-Squared) Distribution](http://en.wikipedia.org/wiki/Chi-squared_distribution) is used in the common
|
2765 | * chi-squared tests for goodness of fit of an observed distribution to a theoretical one, the independence of two
|
2766 | * criteria of classification of qualitative data, and in confidence interval estimation for a population standard
|
2767 | * deviation of a normal distribution from a sample standard deviation.
|
2768 | *
|
2769 | * Values from Appendix 1, Table III of William W. Hines & Douglas C. Montgomery, "Probability and Statistics in
|
2770 | * Engineering and Management Science", Wiley (1980).
|
2771 | */
|
2772 | var chiSquaredDistributionTable = {
|
2773 | 1: {
|
2774 | 0.995: 0,
|
2775 | 0.99: 0,
|
2776 | 0.975: 0,
|
2777 | 0.95: 0,
|
2778 | 0.9: 0.02,
|
2779 | 0.5: 0.45,
|
2780 | 0.1: 2.71,
|
2781 | 0.05: 3.84,
|
2782 | 0.025: 5.02,
|
2783 | 0.01: 6.63,
|
2784 | 0.005: 7.88
|
2785 | },
|
2786 | 2: {
|
2787 | 0.995: 0.01,
|
2788 | 0.99: 0.02,
|
2789 | 0.975: 0.05,
|
2790 | 0.95: 0.1,
|
2791 | 0.9: 0.21,
|
2792 | 0.5: 1.39,
|
2793 | 0.1: 4.61,
|
2794 | 0.05: 5.99,
|
2795 | 0.025: 7.38,
|
2796 | 0.01: 9.21,
|
2797 | 0.005: 10.6
|
2798 | },
|
2799 | 3: {
|
2800 | 0.995: 0.07,
|
2801 | 0.99: 0.11,
|
2802 | 0.975: 0.22,
|
2803 | 0.95: 0.35,
|
2804 | 0.9: 0.58,
|
2805 | 0.5: 2.37,
|
2806 | 0.1: 6.25,
|
2807 | 0.05: 7.81,
|
2808 | 0.025: 9.35,
|
2809 | 0.01: 11.34,
|
2810 | 0.005: 12.84
|
2811 | },
|
2812 | 4: {
|
2813 | 0.995: 0.21,
|
2814 | 0.99: 0.3,
|
2815 | 0.975: 0.48,
|
2816 | 0.95: 0.71,
|
2817 | 0.9: 1.06,
|
2818 | 0.5: 3.36,
|
2819 | 0.1: 7.78,
|
2820 | 0.05: 9.49,
|
2821 | 0.025: 11.14,
|
2822 | 0.01: 13.28,
|
2823 | 0.005: 14.86
|
2824 | },
|
2825 | 5: {
|
2826 | 0.995: 0.41,
|
2827 | 0.99: 0.55,
|
2828 | 0.975: 0.83,
|
2829 | 0.95: 1.15,
|
2830 | 0.9: 1.61,
|
2831 | 0.5: 4.35,
|
2832 | 0.1: 9.24,
|
2833 | 0.05: 11.07,
|
2834 | 0.025: 12.83,
|
2835 | 0.01: 15.09,
|
2836 | 0.005: 16.75
|
2837 | },
|
2838 | 6: {
|
2839 | 0.995: 0.68,
|
2840 | 0.99: 0.87,
|
2841 | 0.975: 1.24,
|
2842 | 0.95: 1.64,
|
2843 | 0.9: 2.2,
|
2844 | 0.5: 5.35,
|
2845 | 0.1: 10.65,
|
2846 | 0.05: 12.59,
|
2847 | 0.025: 14.45,
|
2848 | 0.01: 16.81,
|
2849 | 0.005: 18.55
|
2850 | },
|
2851 | 7: {
|
2852 | 0.995: 0.99,
|
2853 | 0.99: 1.25,
|
2854 | 0.975: 1.69,
|
2855 | 0.95: 2.17,
|
2856 | 0.9: 2.83,
|
2857 | 0.5: 6.35,
|
2858 | 0.1: 12.02,
|
2859 | 0.05: 14.07,
|
2860 | 0.025: 16.01,
|
2861 | 0.01: 18.48,
|
2862 | 0.005: 20.28
|
2863 | },
|
2864 | 8: {
|
2865 | 0.995: 1.34,
|
2866 | 0.99: 1.65,
|
2867 | 0.975: 2.18,
|
2868 | 0.95: 2.73,
|
2869 | 0.9: 3.49,
|
2870 | 0.5: 7.34,
|
2871 | 0.1: 13.36,
|
2872 | 0.05: 15.51,
|
2873 | 0.025: 17.53,
|
2874 | 0.01: 20.09,
|
2875 | 0.005: 21.96
|
2876 | },
|
2877 | 9: {
|
2878 | 0.995: 1.73,
|
2879 | 0.99: 2.09,
|
2880 | 0.975: 2.7,
|
2881 | 0.95: 3.33,
|
2882 | 0.9: 4.17,
|
2883 | 0.5: 8.34,
|
2884 | 0.1: 14.68,
|
2885 | 0.05: 16.92,
|
2886 | 0.025: 19.02,
|
2887 | 0.01: 21.67,
|
2888 | 0.005: 23.59
|
2889 | },
|
2890 | 10: {
|
2891 | 0.995: 2.16,
|
2892 | 0.99: 2.56,
|
2893 | 0.975: 3.25,
|
2894 | 0.95: 3.94,
|
2895 | 0.9: 4.87,
|
2896 | 0.5: 9.34,
|
2897 | 0.1: 15.99,
|
2898 | 0.05: 18.31,
|
2899 | 0.025: 20.48,
|
2900 | 0.01: 23.21,
|
2901 | 0.005: 25.19
|
2902 | },
|
2903 | 11: {
|
2904 | 0.995: 2.6,
|
2905 | 0.99: 3.05,
|
2906 | 0.975: 3.82,
|
2907 | 0.95: 4.57,
|
2908 | 0.9: 5.58,
|
2909 | 0.5: 10.34,
|
2910 | 0.1: 17.28,
|
2911 | 0.05: 19.68,
|
2912 | 0.025: 21.92,
|
2913 | 0.01: 24.72,
|
2914 | 0.005: 26.76
|
2915 | },
|
2916 | 12: {
|
2917 | 0.995: 3.07,
|
2918 | 0.99: 3.57,
|
2919 | 0.975: 4.4,
|
2920 | 0.95: 5.23,
|
2921 | 0.9: 6.3,
|
2922 | 0.5: 11.34,
|
2923 | 0.1: 18.55,
|
2924 | 0.05: 21.03,
|
2925 | 0.025: 23.34,
|
2926 | 0.01: 26.22,
|
2927 | 0.005: 28.3
|
2928 | },
|
2929 | 13: {
|
2930 | 0.995: 3.57,
|
2931 | 0.99: 4.11,
|
2932 | 0.975: 5.01,
|
2933 | 0.95: 5.89,
|
2934 | 0.9: 7.04,
|
2935 | 0.5: 12.34,
|
2936 | 0.1: 19.81,
|
2937 | 0.05: 22.36,
|
2938 | 0.025: 24.74,
|
2939 | 0.01: 27.69,
|
2940 | 0.005: 29.82
|
2941 | },
|
2942 | 14: {
|
2943 | 0.995: 4.07,
|
2944 | 0.99: 4.66,
|
2945 | 0.975: 5.63,
|
2946 | 0.95: 6.57,
|
2947 | 0.9: 7.79,
|
2948 | 0.5: 13.34,
|
2949 | 0.1: 21.06,
|
2950 | 0.05: 23.68,
|
2951 | 0.025: 26.12,
|
2952 | 0.01: 29.14,
|
2953 | 0.005: 31.32
|
2954 | },
|
2955 | 15: {
|
2956 | 0.995: 4.6,
|
2957 | 0.99: 5.23,
|
2958 | 0.975: 6.27,
|
2959 | 0.95: 7.26,
|
2960 | 0.9: 8.55,
|
2961 | 0.5: 14.34,
|
2962 | 0.1: 22.31,
|
2963 | 0.05: 25,
|
2964 | 0.025: 27.49,
|
2965 | 0.01: 30.58,
|
2966 | 0.005: 32.8
|
2967 | },
|
2968 | 16: {
|
2969 | 0.995: 5.14,
|
2970 | 0.99: 5.81,
|
2971 | 0.975: 6.91,
|
2972 | 0.95: 7.96,
|
2973 | 0.9: 9.31,
|
2974 | 0.5: 15.34,
|
2975 | 0.1: 23.54,
|
2976 | 0.05: 26.3,
|
2977 | 0.025: 28.85,
|
2978 | 0.01: 32,
|
2979 | 0.005: 34.27
|
2980 | },
|
2981 | 17: {
|
2982 | 0.995: 5.7,
|
2983 | 0.99: 6.41,
|
2984 | 0.975: 7.56,
|
2985 | 0.95: 8.67,
|
2986 | 0.9: 10.09,
|
2987 | 0.5: 16.34,
|
2988 | 0.1: 24.77,
|
2989 | 0.05: 27.59,
|
2990 | 0.025: 30.19,
|
2991 | 0.01: 33.41,
|
2992 | 0.005: 35.72
|
2993 | },
|
2994 | 18: {
|
2995 | 0.995: 6.26,
|
2996 | 0.99: 7.01,
|
2997 | 0.975: 8.23,
|
2998 | 0.95: 9.39,
|
2999 | 0.9: 10.87,
|
3000 | 0.5: 17.34,
|
3001 | 0.1: 25.99,
|
3002 | 0.05: 28.87,
|
3003 | 0.025: 31.53,
|
3004 | 0.01: 34.81,
|
3005 | 0.005: 37.16
|
3006 | },
|
3007 | 19: {
|
3008 | 0.995: 6.84,
|
3009 | 0.99: 7.63,
|
3010 | 0.975: 8.91,
|
3011 | 0.95: 10.12,
|
3012 | 0.9: 11.65,
|
3013 | 0.5: 18.34,
|
3014 | 0.1: 27.2,
|
3015 | 0.05: 30.14,
|
3016 | 0.025: 32.85,
|
3017 | 0.01: 36.19,
|
3018 | 0.005: 38.58
|
3019 | },
|
3020 | 20: {
|
3021 | 0.995: 7.43,
|
3022 | 0.99: 8.26,
|
3023 | 0.975: 9.59,
|
3024 | 0.95: 10.85,
|
3025 | 0.9: 12.44,
|
3026 | 0.5: 19.34,
|
3027 | 0.1: 28.41,
|
3028 | 0.05: 31.41,
|
3029 | 0.025: 34.17,
|
3030 | 0.01: 37.57,
|
3031 | 0.005: 40
|
3032 | },
|
3033 | 21: {
|
3034 | 0.995: 8.03,
|
3035 | 0.99: 8.9,
|
3036 | 0.975: 10.28,
|
3037 | 0.95: 11.59,
|
3038 | 0.9: 13.24,
|
3039 | 0.5: 20.34,
|
3040 | 0.1: 29.62,
|
3041 | 0.05: 32.67,
|
3042 | 0.025: 35.48,
|
3043 | 0.01: 38.93,
|
3044 | 0.005: 41.4
|
3045 | },
|
3046 | 22: {
|
3047 | 0.995: 8.64,
|
3048 | 0.99: 9.54,
|
3049 | 0.975: 10.98,
|
3050 | 0.95: 12.34,
|
3051 | 0.9: 14.04,
|
3052 | 0.5: 21.34,
|
3053 | 0.1: 30.81,
|
3054 | 0.05: 33.92,
|
3055 | 0.025: 36.78,
|
3056 | 0.01: 40.29,
|
3057 | 0.005: 42.8
|
3058 | },
|
3059 | 23: {
|
3060 | 0.995: 9.26,
|
3061 | 0.99: 10.2,
|
3062 | 0.975: 11.69,
|
3063 | 0.95: 13.09,
|
3064 | 0.9: 14.85,
|
3065 | 0.5: 22.34,
|
3066 | 0.1: 32.01,
|
3067 | 0.05: 35.17,
|
3068 | 0.025: 38.08,
|
3069 | 0.01: 41.64,
|
3070 | 0.005: 44.18
|
3071 | },
|
3072 | 24: {
|
3073 | 0.995: 9.89,
|
3074 | 0.99: 10.86,
|
3075 | 0.975: 12.4,
|
3076 | 0.95: 13.85,
|
3077 | 0.9: 15.66,
|
3078 | 0.5: 23.34,
|
3079 | 0.1: 33.2,
|
3080 | 0.05: 36.42,
|
3081 | 0.025: 39.36,
|
3082 | 0.01: 42.98,
|
3083 | 0.005: 45.56
|
3084 | },
|
3085 | 25: {
|
3086 | 0.995: 10.52,
|
3087 | 0.99: 11.52,
|
3088 | 0.975: 13.12,
|
3089 | 0.95: 14.61,
|
3090 | 0.9: 16.47,
|
3091 | 0.5: 24.34,
|
3092 | 0.1: 34.28,
|
3093 | 0.05: 37.65,
|
3094 | 0.025: 40.65,
|
3095 | 0.01: 44.31,
|
3096 | 0.005: 46.93
|
3097 | },
|
3098 | 26: {
|
3099 | 0.995: 11.16,
|
3100 | 0.99: 12.2,
|
3101 | 0.975: 13.84,
|
3102 | 0.95: 15.38,
|
3103 | 0.9: 17.29,
|
3104 | 0.5: 25.34,
|
3105 | 0.1: 35.56,
|
3106 | 0.05: 38.89,
|
3107 | 0.025: 41.92,
|
3108 | 0.01: 45.64,
|
3109 | 0.005: 48.29
|
3110 | },
|
3111 | 27: {
|
3112 | 0.995: 11.81,
|
3113 | 0.99: 12.88,
|
3114 | 0.975: 14.57,
|
3115 | 0.95: 16.15,
|
3116 | 0.9: 18.11,
|
3117 | 0.5: 26.34,
|
3118 | 0.1: 36.74,
|
3119 | 0.05: 40.11,
|
3120 | 0.025: 43.19,
|
3121 | 0.01: 46.96,
|
3122 | 0.005: 49.65
|
3123 | },
|
3124 | 28: {
|
3125 | 0.995: 12.46,
|
3126 | 0.99: 13.57,
|
3127 | 0.975: 15.31,
|
3128 | 0.95: 16.93,
|
3129 | 0.9: 18.94,
|
3130 | 0.5: 27.34,
|
3131 | 0.1: 37.92,
|
3132 | 0.05: 41.34,
|
3133 | 0.025: 44.46,
|
3134 | 0.01: 48.28,
|
3135 | 0.005: 50.99
|
3136 | },
|
3137 | 29: {
|
3138 | 0.995: 13.12,
|
3139 | 0.99: 14.26,
|
3140 | 0.975: 16.05,
|
3141 | 0.95: 17.71,
|
3142 | 0.9: 19.77,
|
3143 | 0.5: 28.34,
|
3144 | 0.1: 39.09,
|
3145 | 0.05: 42.56,
|
3146 | 0.025: 45.72,
|
3147 | 0.01: 49.59,
|
3148 | 0.005: 52.34
|
3149 | },
|
3150 | 30: {
|
3151 | 0.995: 13.79,
|
3152 | 0.99: 14.95,
|
3153 | 0.975: 16.79,
|
3154 | 0.95: 18.49,
|
3155 | 0.9: 20.6,
|
3156 | 0.5: 29.34,
|
3157 | 0.1: 40.26,
|
3158 | 0.05: 43.77,
|
3159 | 0.025: 46.98,
|
3160 | 0.01: 50.89,
|
3161 | 0.005: 53.67
|
3162 | },
|
3163 | 40: {
|
3164 | 0.995: 20.71,
|
3165 | 0.99: 22.16,
|
3166 | 0.975: 24.43,
|
3167 | 0.95: 26.51,
|
3168 | 0.9: 29.05,
|
3169 | 0.5: 39.34,
|
3170 | 0.1: 51.81,
|
3171 | 0.05: 55.76,
|
3172 | 0.025: 59.34,
|
3173 | 0.01: 63.69,
|
3174 | 0.005: 66.77
|
3175 | },
|
3176 | 50: {
|
3177 | 0.995: 27.99,
|
3178 | 0.99: 29.71,
|
3179 | 0.975: 32.36,
|
3180 | 0.95: 34.76,
|
3181 | 0.9: 37.69,
|
3182 | 0.5: 49.33,
|
3183 | 0.1: 63.17,
|
3184 | 0.05: 67.5,
|
3185 | 0.025: 71.42,
|
3186 | 0.01: 76.15,
|
3187 | 0.005: 79.49
|
3188 | },
|
3189 | 60: {
|
3190 | 0.995: 35.53,
|
3191 | 0.99: 37.48,
|
3192 | 0.975: 40.48,
|
3193 | 0.95: 43.19,
|
3194 | 0.9: 46.46,
|
3195 | 0.5: 59.33,
|
3196 | 0.1: 74.4,
|
3197 | 0.05: 79.08,
|
3198 | 0.025: 83.3,
|
3199 | 0.01: 88.38,
|
3200 | 0.005: 91.95
|
3201 | },
|
3202 | 70: {
|
3203 | 0.995: 43.28,
|
3204 | 0.99: 45.44,
|
3205 | 0.975: 48.76,
|
3206 | 0.95: 51.74,
|
3207 | 0.9: 55.33,
|
3208 | 0.5: 69.33,
|
3209 | 0.1: 85.53,
|
3210 | 0.05: 90.53,
|
3211 | 0.025: 95.02,
|
3212 | 0.01: 100.42,
|
3213 | 0.005: 104.22
|
3214 | },
|
3215 | 80: {
|
3216 | 0.995: 51.17,
|
3217 | 0.99: 53.54,
|
3218 | 0.975: 57.15,
|
3219 | 0.95: 60.39,
|
3220 | 0.9: 64.28,
|
3221 | 0.5: 79.33,
|
3222 | 0.1: 96.58,
|
3223 | 0.05: 101.88,
|
3224 | 0.025: 106.63,
|
3225 | 0.01: 112.33,
|
3226 | 0.005: 116.32
|
3227 | },
|
3228 | 90: {
|
3229 | 0.995: 59.2,
|
3230 | 0.99: 61.75,
|
3231 | 0.975: 65.65,
|
3232 | 0.95: 69.13,
|
3233 | 0.9: 73.29,
|
3234 | 0.5: 89.33,
|
3235 | 0.1: 107.57,
|
3236 | 0.05: 113.14,
|
3237 | 0.025: 118.14,
|
3238 | 0.01: 124.12,
|
3239 | 0.005: 128.3
|
3240 | },
|
3241 | 100: {
|
3242 | 0.995: 67.33,
|
3243 | 0.99: 70.06,
|
3244 | 0.975: 74.22,
|
3245 | 0.95: 77.93,
|
3246 | 0.9: 82.36,
|
3247 | 0.5: 99.33,
|
3248 | 0.1: 118.5,
|
3249 | 0.05: 124.34,
|
3250 | 0.025: 129.56,
|
3251 | 0.01: 135.81,
|
3252 | 0.005: 140.17
|
3253 | }
|
3254 | };
|
3255 |
|
3256 | /**
|
3257 | * The [χ2 (Chi-Squared) Goodness-of-Fit Test](http://en.wikipedia.org/wiki/Goodness_of_fit#Pearson.27s_chi-squared_test)
|
3258 | * uses a measure of goodness of fit which is the sum of differences between observed and expected outcome frequencies
|
3259 | * (that is, counts of observations), each squared and divided by the number of observations expected given the
|
3260 | * hypothesized distribution. The resulting χ2 statistic, `chiSquared`, can be compared to the chi-squared distribution
|
3261 | * to determine the goodness of fit. In order to determine the degrees of freedom of the chi-squared distribution, one
|
3262 | * takes the total number of observed frequencies and subtracts the number of estimated parameters. The test statistic
|
3263 | * follows, approximately, a chi-square distribution with (k − c) degrees of freedom where `k` is the number of non-empty
|
3264 | * cells and `c` is the number of estimated parameters for the distribution.
|
3265 | *
|
3266 | * @param {Array<number>} data
|
3267 | * @param {Function} distributionType a function that returns a point in a distribution:
|
3268 | * for instance, binomial, bernoulli, or poisson
|
3269 | * @param {number} significance
|
3270 | * @returns {number} chi squared goodness of fit
|
3271 | * @example
|
3272 | * // Data from Poisson goodness-of-fit example 10-19 in William W. Hines & Douglas C. Montgomery,
|
3273 | * // "Probability and Statistics in Engineering and Management Science", Wiley (1980).
|
3274 | * var data1019 = [
|
3275 | * 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
3276 | * 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
3277 | * 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
3278 | * 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
3279 | * 3, 3, 3, 3
|
3280 | * ];
|
3281 | * ss.chiSquaredGoodnessOfFit(data1019, ss.poissonDistribution, 0.05); //= false
|
3282 | */
|
3283 | function chiSquaredGoodnessOfFit(data, distributionType, significance) {
|
3284 | // Estimate from the sample data, a weighted mean.
|
3285 | var inputMean = mean(data);
|
3286 | // Calculated value of the χ2 statistic.
|
3287 | var chiSquared = 0;
|
3288 | // Number of hypothesized distribution parameters estimated, expected to be supplied in the distribution test.
|
3289 | // Lose one degree of freedom for estimating `lambda` from the sample data.
|
3290 | var c = 1;
|
3291 | // The hypothesized distribution.
|
3292 | // Generate the hypothesized distribution.
|
3293 | var hypothesizedDistribution = distributionType(inputMean);
|
3294 | var observedFrequencies = [];
|
3295 | var expectedFrequencies = [];
|
3296 |
|
3297 | // Create an array holding a histogram from the sample data, of
|
3298 | // the form `{ value: numberOfOcurrences }`
|
3299 | for (var i = 0; i < data.length; i++) {
|
3300 | if (observedFrequencies[data[i]] === undefined) {
|
3301 | observedFrequencies[data[i]] = 0;
|
3302 | }
|
3303 | observedFrequencies[data[i]]++;
|
3304 | }
|
3305 |
|
3306 | // The histogram we created might be sparse - there might be gaps
|
3307 | // between values. So we iterate through the histogram, making
|
3308 | // sure that instead of undefined, gaps have 0 values.
|
3309 | for (var i$1 = 0; i$1 < observedFrequencies.length; i$1++) {
|
3310 | if (observedFrequencies[i$1] === undefined) {
|
3311 | observedFrequencies[i$1] = 0;
|
3312 | }
|
3313 | }
|
3314 |
|
3315 | // Create an array holding a histogram of expected data given the
|
3316 | // sample size and hypothesized distribution.
|
3317 | for (var k in hypothesizedDistribution) {
|
3318 | if (k in observedFrequencies) {
|
3319 | expectedFrequencies[+k] = hypothesizedDistribution[k] * data.length;
|
3320 | }
|
3321 | }
|
3322 |
|
3323 | // Working backward through the expected frequencies, collapse classes
|
3324 | // if less than three observations are expected for a class.
|
3325 | // This transformation is applied to the observed frequencies as well.
|
3326 | for (var k$1 = expectedFrequencies.length - 1; k$1 >= 0; k$1--) {
|
3327 | if (expectedFrequencies[k$1] < 3) {
|
3328 | expectedFrequencies[k$1 - 1] += expectedFrequencies[k$1];
|
3329 | expectedFrequencies.pop();
|
3330 |
|
3331 | observedFrequencies[k$1 - 1] += observedFrequencies[k$1];
|
3332 | observedFrequencies.pop();
|
3333 | }
|
3334 | }
|
3335 |
|
3336 | // Iterate through the squared differences between observed & expected
|
3337 | // frequencies, accumulating the `chiSquared` statistic.
|
3338 | for (var k$2 = 0; k$2 < observedFrequencies.length; k$2++) {
|
3339 | chiSquared +=
|
3340 | Math.pow(observedFrequencies[k$2] - expectedFrequencies[k$2], 2) /
|
3341 | expectedFrequencies[k$2];
|
3342 | }
|
3343 |
|
3344 | // Calculate degrees of freedom for this test and look it up in the
|
3345 | // `chiSquaredDistributionTable` in order to
|
3346 | // accept or reject the goodness-of-fit of the hypothesized distribution.
|
3347 | // Degrees of freedom, calculated as (number of class intervals -
|
3348 | // number of hypothesized distribution parameters estimated - 1)
|
3349 | var degreesOfFreedom = observedFrequencies.length - c - 1;
|
3350 | return (
|
3351 | chiSquaredDistributionTable[degreesOfFreedom][significance] < chiSquared
|
3352 | );
|
3353 | }
|
3354 |
|
// Normalizing constant for the gaussian kernel: sqrt(2 * pi).
var SQRT_2PI = Math.sqrt(2 * Math.PI);

/**
 * [Well-known kernels](https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use)
 * @private
 */
var kernels = {
    /**
     * The gaussian kernel: the standard normal density
     * exp(-u^2 / 2) / sqrt(2 * pi).
     * @private
     */
    gaussian: function (u) {
        return Math.exp(-0.5 * u * u) / SQRT_2PI;
    }
};

/**
 * Well known bandwidth selection methods
 * @private
 */
var bandwidthMethods = {
    /**
     * The ["normal reference distribution"
     * rule-of-thumb](https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/bandwidth.nrd.html),
     * a commonly used version of [Silverman's
     * rule-of-thumb](https://en.wikipedia.org/wiki/Kernel_density_estimation#A_rule-of-thumb_bandwidth_estimator).
     * @private
     */
    nrd: function (x) {
        // Use the smaller of the standard deviation and the rescaled IQR
        // as the spread estimate, when the IQR is available.
        var spread = sampleStandardDeviation(x);
        var interquartile = interquartileRange(x);
        if (typeof interquartile === "number") {
            spread = Math.min(spread, interquartile / 1.34);
        }
        return 1.06 * spread * Math.pow(x.length, -0.2);
    }
};
|
3392 |
|
3393 | /**
|
3394 | * [Kernel density estimation](https://en.wikipedia.org/wiki/Kernel_density_estimation)
|
3395 | * is a useful tool for, among other things, estimating the shape of the
|
3396 | * underlying probability distribution from a sample.
|
3397 | *
|
3398 | * @name kernelDensityEstimation
|
3399 | * @param X sample values
|
3400 | * @param kernel The kernel function to use. If a function is provided, it should return non-negative values and integrate to 1. Defaults to 'gaussian'.
|
3401 | * @param bandwidthMethod The "bandwidth selection" method to use, or a fixed bandwidth value. Defaults to "nrd", the commonly-used ["normal reference distribution" rule-of-thumb](https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/bandwidth.nrd.html).
|
3402 | * @returns {Function} An estimated [probability density function](https://en.wikipedia.org/wiki/Probability_density_function) for the given sample. The returned function runs in `O(X.length)`.
|
3403 | */
|
3404 | function kernelDensityEstimation(X, kernel, bandwidthMethod) {
|
3405 | var kernelFn;
|
3406 | if (kernel === undefined) {
|
3407 | kernelFn = kernels.gaussian;
|
3408 | } else if (typeof kernel === "string") {
|
3409 | if (!kernels[kernel]) {
|
3410 | throw new Error('Unknown kernel "' + kernel + '"');
|
3411 | }
|
3412 | kernelFn = kernels[kernel];
|
3413 | } else {
|
3414 | kernelFn = kernel;
|
3415 | }
|
3416 |
|
3417 | var bandwidth;
|
3418 | if (typeof bandwidthMethod === "undefined") {
|
3419 | bandwidth = bandwidthMethods.nrd(X);
|
3420 | } else if (typeof bandwidthMethod === "string") {
|
3421 | if (!bandwidthMethods[bandwidthMethod]) {
|
3422 | throw new Error(
|
3423 | 'Unknown bandwidth method "' + bandwidthMethod + '"'
|
3424 | );
|
3425 | }
|
3426 | bandwidth = bandwidthMethods[bandwidthMethod](X);
|
3427 | } else {
|
3428 | bandwidth = bandwidthMethod;
|
3429 | }
|
3430 |
|
3431 | return function (x) {
|
3432 | var i = 0;
|
3433 | var sum = 0;
|
3434 | for (i = 0; i < X.length; i++) {
|
3435 | sum += kernelFn((x - X[i]) / bandwidth);
|
3436 | }
|
3437 | return sum / bandwidth / X.length;
|
3438 | };
|
3439 | }
|
3440 |
|
3441 | /**
|
3442 | * The [Z-Score, or Standard Score](http://en.wikipedia.org/wiki/Standard_score).
|
3443 | *
|
3444 | * The standard score is the number of standard deviations an observation
|
3445 | * or datum is above or below the mean. Thus, a positive standard score
|
3446 | * represents a datum above the mean, while a negative standard score
|
3447 | * represents a datum below the mean. It is a dimensionless quantity
|
3448 | * obtained by subtracting the population mean from an individual raw
|
3449 | * score and then dividing the difference by the population standard
|
3450 | * deviation.
|
3451 | *
|
3452 | * The z-score is only defined if one knows the population parameters;
|
3453 | * if one only has a sample set, then the analogous computation with
|
3454 | * sample mean and sample standard deviation yields the
|
3455 | * Student's t-statistic.
|
3456 | *
|
3457 | * @param {number} x
|
3458 | * @param {number} mean
|
3459 | * @param {number} standardDeviation
|
3460 | * @return {number} z score
|
3461 | * @example
|
3462 | * zScore(78, 80, 5); // => -0.4
|
3463 | */
|
3464 | function zScore(x, mean, standardDeviation) {
|
3465 | return (x - mean) / standardDeviation;
|
3466 | }
|
3467 |
|
// Normalizing constant sqrt(2 * pi) for the series below.
var SQRT_2PI$1 = Math.sqrt(2 * Math.PI);

// Series approximation of the standard normal cumulative distribution
// function at `z`, rounded to four decimal places; used to build
// `standardNormalTable`.
function cumulativeDistribution(z) {
    var term = z;
    var total = z;

    // 15 iterations are enough for 4-digit precision
    for (var k = 1; k < 15; k++) {
        term *= (z * z) / (2 * k + 1);
        total += term;
    }
    return (
        Math.round((0.5 + (total / SQRT_2PI$1) * Math.exp((-z * z) / 2)) * 1e4) /
        1e4
    );
}
|
3484 |
|
3485 | /**
|
3486 | * A standard normal table, also called the unit normal table or Z table,
|
3487 | * is a mathematical table for the values of Φ (phi), which are the values of
|
3488 | * the cumulative distribution function of the normal distribution.
|
3489 | * It is used to find the probability that a statistic is observed below,
|
3490 | * above, or between values on the standard normal distribution, and by
|
3491 | * extension, any normal distribution.
|
3492 | *
|
3493 | * The probabilities are calculated using the
|
3494 | * [Cumulative distribution function](https://en.wikipedia.org/wiki/Normal_distribution#Cumulative_distribution_function).
|
3495 | * The table used is the cumulative, and not cumulative from 0 to mean
|
3496 | * (even though the latter has 5 digits precision, instead of 4).
|
3497 | */
|
3498 | var standardNormalTable = [];
|
3499 |
|
3500 | for (var z = 0; z <= 3.09; z += 0.01) {
|
3501 | standardNormalTable.push(cumulativeDistribution(z));
|
3502 | }
|
3503 |
|
3504 | /**
|
3505 | * **[Cumulative Standard Normal Probability](http://en.wikipedia.org/wiki/Standard_normal_table)**
|
3506 | *
|
3507 | * Since probability tables cannot be
|
3508 | * printed for every normal distribution, as there are an infinite variety
|
3509 | * of normal distributions, it is common practice to convert a normal to a
|
3510 | * standard normal and then use the standard normal table to find probabilities.
|
3511 | *
|
3512 | * You can use `.5 + .5 * errorFunction(x / Math.sqrt(2))` to calculate the probability
|
3513 | * instead of looking it up in a table.
|
3514 | *
|
3515 | * @param {number} z
|
3516 | * @returns {number} cumulative standard normal probability
|
3517 | */
|
3518 | function cumulativeStdNormalProbability(z) {
|
3519 | // Calculate the position of this value.
|
3520 | var absZ = Math.abs(z);
|
3521 | // Each row begins with a different
|
3522 | // significant digit: 0.5, 0.6, 0.7, and so on. Each value in the table
|
3523 | // corresponds to a range of 0.01 in the input values, so the value is
|
3524 | // multiplied by 100.
|
3525 | var index = Math.min(
|
3526 | Math.round(absZ * 100),
|
3527 | standardNormalTable.length - 1
|
3528 | );
|
3529 |
|
3530 | // The index we calculate must be in the table as a positive value,
|
3531 | // but we still pay attention to whether the input is positive
|
3532 | // or negative, and flip the output value as a last step.
|
3533 | if (z >= 0) {
|
3534 | return standardNormalTable[index];
|
3535 | } else {
|
3536 | // due to floating-point arithmetic, values in the table with
|
3537 | // 4 significant figures can nevertheless end up as repeating
|
3538 | // fractions when they're computed here.
|
3539 | return +(1 - standardNormalTable[index]).toFixed(4);
|
3540 | }
|
3541 | }
|
3542 |
|
/**
 * **[Gaussian error function](http://en.wikipedia.org/wiki/Error_function)**
 *
 * The `errorFunction(x/(sd * Math.sqrt(2)))` is the probability that a value in a
 * normal distribution with standard deviation sd is within x of the mean.
 *
 * This function returns a numerical approximation to the exact value.
 * It uses Horner's method to evaluate the polynomial of τ (tau).
 *
 * @param {number} x input
 * @return {number} error estimation
 * @example
 * errorFunction(1).toFixed(2); // => '0.84'
 */
function errorFunction(x) {
    var t = 1 / (1 + 0.5 * Math.abs(x));

    // Polynomial coefficients in t, highest degree first; evaluated with
    // Horner's method below. Starting the accumulator at 0 reproduces the
    // original nested-multiplication expression exactly.
    var coefficients = [
        0.17087277, -0.82215223, 1.48851587, -1.13520398, 0.27886807,
        -0.18628806, 0.09678418, 0.37409196, 1.00002368, -1.26551223
    ];
    var polynomial = 0;
    for (var i = 0; i < coefficients.length; i++) {
        polynomial = polynomial * t + coefficients[i];
    }

    var tau = t * Math.exp(-x * x + polynomial);

    // erf is odd: exploit symmetry for negative inputs.
    return x >= 0 ? 1 - tau : tau - 1;
}
|
3584 |
|
/**
 * The Inverse [Gaussian error function](http://en.wikipedia.org/wiki/Error_function)
 * returns a numerical approximation to the value that would have caused
 * `errorFunction()` to return x.
 *
 * @param {number} x value of error function
 * @returns {number} estimated inverted value
 */
function inverseErrorFunction(x) {
    // Constant from the Winitzki-style closed-form approximation of erf⁻¹.
    var a = (8 * (Math.PI - 3)) / (3 * Math.PI * (4 - Math.PI));

    // Shared subterms of the approximation formula.
    var logTerm = Math.log(1 - x * x);
    var halfSum = 2 / (Math.PI * a) + logTerm / 2;

    var magnitude = Math.sqrt(
        Math.sqrt(Math.pow(halfSum, 2) - logTerm / a) - halfSum
    );

    // erf⁻¹ is odd, so mirror the magnitude for negative inputs.
    return x >= 0 ? magnitude : -magnitude;
}
|
3610 |
|
/**
 * The [Probit](http://en.wikipedia.org/wiki/Probit)
 * is the inverse of cumulativeStdNormalProbability(),
 * and is also known as the normal quantile function.
 *
 * It returns the number of standard deviations from the mean
 * where the p'th quantile of values can be found in a normal distribution.
 * So, for example, probit(0.5 + 0.6827/2) ≈ 1 because 68.27% of values are
 * normally found within 1 standard deviation above or below the mean.
 *
 * @param {number} p
 * @returns {number} probit
 */
function probit(p) {
    // Nudge the endpoints inward by epsilon so erf⁻¹ is not evaluated at
    // exactly ±1, where it diverges.
    var quantile = p;
    if (quantile === 0) {
        quantile = epsilon;
    } else if (quantile >= 1) {
        quantile = 1 - epsilon;
    }
    return Math.sqrt(2) * inverseErrorFunction(2 * quantile - 1);
}
|
3632 |
|
/**
 * Conducts a [permutation test](https://en.wikipedia.org/wiki/Resampling_(statistics)#Permutation_tests)
 * to determine if two data sets are *significantly* different from each other, using
 * the difference of means between the groups as the test statistic.
 * The function allows for the following hypotheses:
 * - two_side = Null hypothesis: the two distributions are equal.
 * - greater = Null hypothesis: observations from sampleX tend to be smaller than those from sampleY.
 * - less = Null hypothesis: observations from sampleX tend to be greater than those from sampleY.
 * [Learn more about one-tail vs two-tail tests.](https://en.wikipedia.org/wiki/One-_and_two-tailed_tests)
 *
 * @param {Array<number>} sampleX first dataset (e.g. treatment data)
 * @param {Array<number>} sampleY second dataset (e.g. control data)
 * @param {string} alternative alternative hypothesis, either 'two_side' (default), 'greater', or 'less'
 * @param {number} k number of values in permutation distribution.
 * @param {Function} [randomSource=Math.random] an optional entropy source
 * @returns {number} p-value The probability of observing the difference between groups (as or more extreme than what we did), assuming the null hypothesis.
 * @throws {Error} if `alternative` is not one of the three supported strings
 *
 * @example
 * var control = [2, 5, 3, 6, 7, 2, 5];
 * var treatment = [20, 5, 13, 12, 7, 2, 2];
 * permutationTest(control, treatment); // ~0.1324
 */
function permutationTest(sampleX, sampleY, alternative, k, randomSource) {
    // Set default arguments
    if (k === undefined) {
        k = 10000;
    }
    if (alternative === undefined) {
        alternative = "two_side";
    }
    if (
        alternative !== "two_side" &&
        alternative !== "greater" &&
        alternative !== "less"
    ) {
        throw new Error(
            "`alternative` must be either 'two_side', 'greater', or 'less'"
        );
    }

    // get means for each sample
    var meanX = mean(sampleX);
    var meanY = mean(sampleY);

    // calculate initial test statistic. This will be our point of comparison with
    // the generated test statistics.
    var testStatistic = meanX - meanY;

    // create test-statistic distribution
    var testStatDsn = new Array(k);

    // combine datasets so we can easily shuffle later
    var allData = sampleX.concat(sampleY);
    var midIndex = Math.floor(allData.length / 2);

    for (var i = 0; i < k; i++) {
        // 1. shuffle data assignments
        shuffleInPlace(allData, randomSource);
        var permLeft = allData.slice(0, midIndex);
        var permRight = allData.slice(midIndex, allData.length);

        // 2. re-calculate test statistic
        var permTestStatistic = mean(permLeft) - mean(permRight);

        // 3. store test statistic to build test statistic distribution
        testStatDsn[i] = permTestStatistic;
    }

    // Calculate p-value depending on alternative
    // For this test, we calculate the percentage of 'extreme' test statistics (subject to our hypothesis)
    // more info on permutation test p-value calculations: https://onlinecourses.science.psu.edu/stat464/node/35
    //
    // FIX: these loops previously iterated with `i <= k`, reading one element
    // past the end of the length-k `testStatDsn` array. Comparisons against
    // `undefined` are always false, so the p-value was unaffected, but the
    // out-of-bounds read is corrected here to `i < k`.
    var numExtremeTStats = 0;
    if (alternative === "two_side") {
        for (var i$1 = 0; i$1 < k; i$1++) {
            if (Math.abs(testStatDsn[i$1]) >= Math.abs(testStatistic)) {
                numExtremeTStats += 1;
            }
        }
    } else if (alternative === "greater") {
        for (var i$2 = 0; i$2 < k; i$2++) {
            if (testStatDsn[i$2] >= testStatistic) {
                numExtremeTStats += 1;
            }
        }
    } else {
        // alternative === 'less'
        for (var i$3 = 0; i$3 < k; i$3++) {
            if (testStatDsn[i$3] <= testStatistic) {
                numExtremeTStats += 1;
            }
        }
    }

    return numExtremeTStats / k;
}
|
3728 |
|
/**
 * [Sign](https://en.wikipedia.org/wiki/Sign_function) is a function
 * that extracts the sign of a real number
 *
 * @param {number} x input value
 * @returns {number} sign value either 1, 0 or -1
 * @throws {TypeError} if the input argument x is not a number
 * @private
 *
 * @example
 * sign(2); // => 1
 */
function sign(x) {
    // Reject non-numeric input up front.
    if (typeof x !== "number") {
        throw new TypeError("not a number");
    }
    if (x < 0) {
        return -1;
    }
    if (x === 0) {
        return 0;
    }
    // Positive numbers (and NaN, which fails both tests above) land here,
    // matching the original branch order.
    return 1;
}
|
3754 |
|
/**
 * [Bisection method](https://en.wikipedia.org/wiki/Bisection_method) is a root-finding
 * method that repeatedly bisects an interval to find the root.
 *
 * This function returns a numerical approximation to the exact value.
 *
 * @param {Function} func input function
 * @param {number} start - start of interval
 * @param {number} end - end of interval
 * @param {number} maxIterations - the maximum number of iterations
 * @param {number} errorTolerance - the error tolerance
 * @returns {number} estimated root value
 * @throws {TypeError} Argument func must be a function
 * @throws {Error} if no root is found within maxIterations
 *
 * @example
 * bisect(Math.cos,0,4,100,0.003); // => 1.572265625
 */
function bisect(func, start, end, maxIterations, errorTolerance) {
    if (typeof func !== "function") {
        throw new TypeError("func must be a function");
    }

    var iteration = 0;
    while (iteration < maxIterations) {
        var midpoint = (start + end) / 2;

        // Stop when we hit an exact root or the interval is narrow enough.
        if (
            func(midpoint) === 0 ||
            Math.abs((end - start) / 2) < errorTolerance
        ) {
            return midpoint;
        }

        // Keep the half-interval whose endpoints straddle the sign change.
        if (sign(func(midpoint)) === sign(func(start))) {
            start = midpoint;
        } else {
            end = midpoint;
        }

        iteration += 1;
    }

    throw new Error("maximum number of iterations exceeded");
}
|
3795 |
|
/**
 * Calculate Euclidean distance between two points.
 * @param {Array<number>} left First N-dimensional point.
 * @param {Array<number>} right Second N-dimensional point.
 * @returns {number} Distance.
 */
function euclideanDistance(left, right) {
    // Accumulate squared per-dimension differences, then take the root.
    var squaredTotal = 0;
    for (var dim = 0; dim < left.length; dim++) {
        var delta = left[dim] - right[dim];
        squaredTotal += delta * delta;
    }
    return Math.sqrt(squaredTotal);
}
|
3810 |
|
3811 | /**
|
3812 | * @typedef {Object} kMeansReturn
|
3813 | * @property {Array<number>} labels The labels.
|
3814 | * @property {Array<Array<number>>} centroids The cluster centroids.
|
3815 | */
|
3816 |
|
/**
 * Perform k-means clustering.
 *
 * @param {Array<Array<number>>} points N-dimensional coordinates of points to be clustered.
 * @param {number} numCluster How many clusters to create.
 * @param {Function} randomSource An optional entropy source that generates uniform values in [0, 1).
 * @return {kMeansReturn} Labels (same length as data) and centroids (same length as numCluster).
 * @throws {Error} If any centroids wind up friendless (i.e., without associated points).
 *
 * @example
 * kMeansCluster([[0.0, 0.5], [1.0, 0.5]], 2); // => {labels: [0, 1], centroids: [[0.0, 0.5], [1.0 0.5]]}
 */
function kMeansCluster(points, numCluster, randomSource) {
    if ( randomSource === void 0 ) randomSource = Math.random;

    // Seed centroids by sampling points, then iterate assignment/update
    // steps until the centroids stop moving.
    var centroids = sample(points, numCluster, randomSource);
    var previousCentroids = null;
    var labels = null;
    var movement = Number.MAX_VALUE;
    while (movement !== 0) {
        labels = labelPoints(points, centroids);
        previousCentroids = centroids;
        centroids = calculateCentroids(points, labels, numCluster);
        movement = calculateChange(centroids, previousCentroids);
    }
    return {
        labels: labels,
        centroids: centroids
    };
}
|
3847 |
|
/**
 * Label each point according to which centroid it is closest to.
 *
 * @private
 * @param {Array<Array<number>>} points Array of XY coordinates.
 * @param {Array<Array<number>>} centroids Current centroids.
 * @return {Array<number>} Group labels.
 */
function labelPoints(points, centroids) {
    var labels = [];
    for (var p = 0; p < points.length; p++) {
        // Track the nearest centroid seen so far for this point.
        var best = -1;
        var bestDistance = Number.MAX_VALUE;
        for (var c = 0; c < centroids.length; c++) {
            var distance = euclideanDistance(points[p], centroids[c]);
            if (distance < bestDistance) {
                bestDistance = distance;
                best = c;
            }
        }
        labels.push(best);
    }
    return labels;
}
|
3870 |
|
/**
 * Calculate centroids for points given labels.
 *
 * @private
 * @param {Array<Array<number>>} points Array of XY coordinates.
 * @param {Array<number>} labels Which groups points belong to.
 * @param {number} numCluster Number of clusters being created.
 * @return {Array<Array<number>>} Centroid for each group.
 * @throws {Error} If any centroids wind up friendless (i.e., without associated points).
 */
function calculateCentroids(points, labels, numCluster) {
    // Per-cluster coordinate sums and membership counts.
    var dimension = points[0].length;
    var centroids = makeMatrix(numCluster, dimension);
    var counts = Array(numCluster).fill(0);

    // Accumulate each point into its assigned cluster.
    for (var p = 0; p < points.length; p++) {
        var accumulator = centroids[labels[p]];
        for (var d = 0; d < dimension; d++) {
            accumulator[d] += points[p][d];
        }
        counts[labels[p]] += 1;
    }

    // Divide sums by counts to get means, rejecting empty clusters.
    for (var c = 0; c < numCluster; c++) {
        if (counts[c] === 0) {
            throw new Error(("Centroid " + c + " has no friends"));
        }
        for (var d$1 = 0; d$1 < dimension; d$1++) {
            centroids[c][d$1] /= counts[c];
        }
    }

    return centroids;
}
|
3912 |
|
/**
 * Calculate the difference between old centroids and new centroids.
 *
 * @private
 * @param {Array<Array<number>>} left One list of centroids.
 * @param {Array<Array<number>>} right Another list of centroids.
 * @return {number} Distance between centroids.
 */
function calculateChange(left, right) {
    // Sum the pairwise distances between corresponding centroids.
    var total = 0;
    left.forEach(function (centroid, i) {
        total += euclideanDistance(centroid, right[i]);
    });
    return total;
}
|
3928 |
|
/**
 * Calculate the [silhouette values](https://en.wikipedia.org/wiki/Silhouette_(clustering))
 * for clustered data.
 *
 * @param {Array<Array<number>>} points N-dimensional coordinates of points.
 * @param {Array<number>} labels Labels of points. This must be the same length as `points`,
 * and values must lie in [0..G-1], where G is the number of groups.
 * @return {Array<number>} The silhouette value for each point.
 * @throws {Error} if points and labels have different lengths
 *
 * @example
 * silhouette([[0.25], [0.75]], [0, 0]); // => [1.0, 1.0]
 */
function silhouette(points, labels) {
    if (points.length !== labels.length) {
        throw new Error("must have exactly as many labels as points");
    }
    var groupings = createGroups(labels);
    var distances = calculateAllDistances(points);
    return points.map(function (_, i) {
        var ownGroup = groupings[labels[i]];
        // A point alone in its group gets a silhouette of 0 by convention.
        if (ownGroup.length <= 1) {
            return 0;
        }
        // a: mean distance to the point's own group;
        // b: mean distance to the nearest other group.
        var a = meanDistanceFromPointToGroup(i, ownGroup, distances);
        var b = meanDistanceToNearestGroup(i, labels, groupings, distances);
        return (b - a) / Math.max(a, b);
    });
}
|
3968 |
|
/**
 * Create a lookup table mapping group IDs to point IDs.
 *
 * @private
 * @param {Array<number>} labels Labels of points. This must be the same length as `points`,
 * and values must lie in [0..G-1], where G is the number of groups.
 * @return {Array<Array<number>>} An array of length G, each of whose entries is an array
 * containing the indices of the points in that group.
 */
function createGroups(labels) {
    // One slot per group; slots are materialized lazily on first member.
    var groups = Array(1 + max(labels));
    labels.forEach(function (label, pointIndex) {
        if (groups[label] === undefined) {
            groups[label] = [];
        }
        groups[label].push(pointIndex);
    });
    return groups;
}
|
3990 |
|
/**
 * Create a lookup table of all inter-point distances.
 *
 * @private
 * @param {Array<Array<number>>} points N-dimensional coordinates of points.
 * @return {Array<Array<number>>} A symmetric square array of inter-point distances
 * (zero on the diagonal).
 */
function calculateAllDistances(points) {
    var n = points.length;
    var distances = makeMatrix(n, n);
    // Only compute the lower triangle and mirror it; the diagonal stays at
    // the matrix's initial zero.
    for (var row = 1; row < n; row++) {
        for (var col = 0; col < row; col++) {
            var d = euclideanDistance(points[row], points[col]);
            distances[row][col] = d;
            distances[col][row] = d;
        }
    }
    return distances;
}
|
4010 |
|
/**
 * Calculate the mean distance between this point and all the points in the
 * nearest group (as determined by which point in another group is closest).
 *
 * @private
 * @param {number} which The index of this point.
 * @param {Array<number>} labels Labels of points.
 * @param {Array<Array<number>>} groupings An array whose entries are arrays
 * containing the indices of the points in that group.
 * @param {Array<Array<number>>} distances A symmetric square array of inter-point
 * distances.
 * @return {number} The mean distance from this point to others in the nearest
 * group.
 */
function meanDistanceToNearestGroup(which, labels, groupings, distances) {
    var ownLabel = labels[which];
    var nearest = Number.MAX_VALUE;
    for (var g = 0; g < groupings.length; g++) {
        // Skip the point's own group; we want the closest *other* group.
        if (g === ownLabel) {
            continue;
        }
        var meanDist = meanDistanceFromPointToGroup(
            which,
            groupings[g],
            distances
        );
        if (meanDist < nearest) {
            nearest = meanDist;
        }
    }
    return nearest;
}
|
4042 |
|
/**
 * Calculate the mean distance between a point and all the points in a group
 * (possibly its own).
 *
 * @private
 * @param {number} which The index of this point.
 * @param {Array<number>} group The indices of all the points in the group in
 * question.
 * @param {Array<Array<number>>} distances A symmetric square array of inter-point
 * distances.
 * @return {number} The mean distance from this point to others in the
 * specified group.
 */
function meanDistanceFromPointToGroup(which, group, distances) {
    // Average the precomputed distances from `which` to each group member.
    var sum = 0;
    group.forEach(function (member) {
        sum += distances[which][member];
    });
    return sum / group.length;
}
|
4063 |
|
/**
 * Calculate the [silhouette metric](https://en.wikipedia.org/wiki/Silhouette_(clustering))
 * for a set of N-dimensional points arranged in groups. The metric is the largest
 * individual silhouette value for the data.
 *
 * @param {Array<Array<number>>} points N-dimensional coordinates of points.
 * @param {Array<number>} labels Labels of points. This must be the same length as `points`,
 * and values must lie in [0..G-1], where G is the number of groups.
 * @return {number} The silhouette metric for the groupings.
 *
 * @example
 * silhouetteMetric([[0.25], [0.75]], [0, 0]); // => 1.0
 */
function silhouetteMetric(points, labels) {
    // The metric is simply the maximum per-point silhouette value.
    return max(silhouette(points, labels));
}
|
4081 |
|
/**
 * Relative error.
 *
 * This is more difficult to calculate than it first appears [1,2]. The usual
 * formula for the relative error between an actual value A and an expected
 * value E is `|(A-E)/E|`, but:
 *
 * 1. If the expected value is 0, any other value has infinite relative error,
 *    which is counter-intuitive: if the expected voltage is 0, getting 1/10th
 *    of a volt doesn't feel like an infinitely large error.
 *
 * 2. This formula does not satisfy the mathematical definition of a metric [3].
 *    [4] solved this problem by defining the relative error as `|ln(|A/E|)|`,
 *    but that formula only works if all values are positive: for example, it
 *    reports the relative error of -10 and 10 as 0.
 *
 * Our implementation sticks with convention and returns:
 *
 * - 0 if the actual and expected values are both zero
 * - Infinity if the actual value is non-zero and the expected value is zero
 * - `|(A-E)/E|` in all other cases
 *
 * [1] https://math.stackexchange.com/questions/677852/how-to-calculate-relative-error-when-true-value-is-zero
 * [2] https://en.wikipedia.org/wiki/Relative_change_and_difference
 * [3] https://en.wikipedia.org/wiki/Metric_(mathematics)#Definition
 * [4] F.W.J. Olver: "A New Approach to Error Arithmetic." SIAM Journal on
 *     Numerical Analysis, 15(2), 1978, 10.1137/0715024.
 *
 * @param {number} actual The actual value.
 * @param {number} expected The expected value.
 * @return {number} The relative error.
 */
function relativeError(actual, expected) {
    // Only the exact both-zero case is special-cased; a non-zero actual with
    // a zero expected divides by zero and yields Infinity naturally.
    var bothZero = actual === 0 && expected === 0;
    return bothZero ? 0 : Math.abs((actual - expected) / expected);
}
|
4120 |
|
/**
 * Approximate equality.
 *
 * @param {number} actual The value to be tested.
 * @param {number} expected The reference value.
 * @param {number} tolerance The acceptable relative difference.
 * @return {boolean} Whether numbers are within tolerance.
 */
function approxEqual(actual, expected, tolerance) {
    // Default the tolerance to the library-wide epsilon.
    if (tolerance === undefined) {
        tolerance = epsilon;
    }
    var error = relativeError(actual, expected);
    return error <= tolerance;
}
|
4134 |
|
// Public API surface of the bundle. Each statistic is exported under its
// canonical name; several also get a short alias bound to the same
// implementation (average→mean, bayesian→BayesianClassifier, erf→errorFunction,
// iqr→interquartileRange, kde→kernelDensityEstimation, mad→medianAbsoluteDeviation,
// perceptron→PerceptronModel, rms→rootMeanSquare).
exports.BayesianClassifier = BayesianClassifier;
exports.PerceptronModel = PerceptronModel;
exports.addToMean = addToMean;
exports.approxEqual = approxEqual;
exports.average = mean;
exports.averageSimple = meanSimple;
exports.bayesian = BayesianClassifier;
exports.bernoulliDistribution = bernoulliDistribution;
exports.binomialDistribution = binomialDistribution;
exports.bisect = bisect;
exports.chiSquaredDistributionTable = chiSquaredDistributionTable;
exports.chiSquaredGoodnessOfFit = chiSquaredGoodnessOfFit;
exports.chunk = chunk;
exports.ckmeans = ckmeans;
exports.combinations = combinations;
exports.combinationsReplacement = combinationsReplacement;
exports.combineMeans = combineMeans;
exports.combineVariances = combineVariances;
exports.cumulativeStdNormalProbability = cumulativeStdNormalProbability;
exports.epsilon = epsilon;
exports.equalIntervalBreaks = equalIntervalBreaks;
exports.erf = errorFunction;
exports.errorFunction = errorFunction;
exports.extent = extent;
exports.extentSorted = extentSorted;
exports.factorial = factorial;
exports.gamma = gamma;
exports.gammaln = gammaln;
exports.geometricMean = geometricMean;
exports.harmonicMean = harmonicMean;
exports.interquartileRange = interquartileRange;
exports.inverseErrorFunction = inverseErrorFunction;
exports.iqr = interquartileRange;
exports.kMeansCluster = kMeansCluster;
exports.kde = kernelDensityEstimation;
exports.kernelDensityEstimation = kernelDensityEstimation;
exports.linearRegression = linearRegression;
exports.linearRegressionLine = linearRegressionLine;
exports.mad = medianAbsoluteDeviation;
exports.max = max;
exports.maxSorted = maxSorted;
exports.mean = mean;
exports.meanSimple = meanSimple;
exports.median = median;
exports.medianAbsoluteDeviation = medianAbsoluteDeviation;
exports.medianSorted = medianSorted;
exports.min = min;
exports.minSorted = minSorted;
exports.mode = mode;
exports.modeFast = modeFast;
exports.modeSorted = modeSorted;
exports.numericSort = numericSort;
exports.perceptron = PerceptronModel;
exports.permutationTest = permutationTest;
exports.permutationsHeap = permutationsHeap;
exports.poissonDistribution = poissonDistribution;
exports.probit = probit;
exports.product = product;
exports.quantile = quantile;
exports.quantileRank = quantileRank;
exports.quantileRankSorted = quantileRankSorted;
exports.quantileSorted = quantileSorted;
exports.quickselect = quickselect;
exports.rSquared = rSquared;
exports.relativeError = relativeError;
exports.rms = rootMeanSquare;
exports.rootMeanSquare = rootMeanSquare;
exports.sample = sample;
exports.sampleCorrelation = sampleCorrelation;
exports.sampleCovariance = sampleCovariance;
exports.sampleKurtosis = sampleKurtosis;
exports.sampleSkewness = sampleSkewness;
exports.sampleStandardDeviation = sampleStandardDeviation;
exports.sampleVariance = sampleVariance;
exports.sampleWithReplacement = sampleWithReplacement;
exports.shuffle = shuffle;
exports.shuffleInPlace = shuffleInPlace;
exports.sign = sign;
exports.silhouette = silhouette;
exports.silhouetteMetric = silhouetteMetric;
exports.standardDeviation = standardDeviation;
exports.standardNormalTable = standardNormalTable;
exports.subtractFromMean = subtractFromMean;
exports.sum = sum;
exports.sumNthPowerDeviations = sumNthPowerDeviations;
exports.sumSimple = sumSimple;
exports.tTest = tTest;
exports.tTestTwoSample = tTestTwoSample;
exports.uniqueCountSorted = uniqueCountSorted;
exports.variance = variance;
exports.zScore = zScore;
//# sourceMappingURL=simple-statistics.js.map
|