/**
 * [Simple linear regression](http://en.wikipedia.org/wiki/Simple_linear_regression)
 * is a simple way to find a fitted line between a set of coordinates.
 * This algorithm finds the slope and y-intercept of a regression line
 * by minimizing the sum of squared residuals (least squares).
 *
 * @param {Array<Array<number>>} data an array of two-element arrays,
 * like `[[0, 1], [2, 3]]`
 * @returns {Object} object containing slope and intercept of regression line
 * @example
 * linearRegression([[0, 0], [1, 1]]); // => { m: 1, b: 0 }
 */
function linearRegression(data) {
    var m, b;

    // Store data length in a local variable to reduce
    // repeated object property lookups
    var dataLength = data.length;

    // if there's only one point, arbitrarily choose a slope of 0
    // and a y-intercept of whatever the y of the initial point is
    if (dataLength === 1) {
        m = 0;
        b = data[0][1];
    } else {
        // Initialize our sums and scope the `m` and `b`
        // variables that define the line.
        var sumX = 0,
            sumY = 0,
            sumXX = 0,
            sumXY = 0;

        // Use local variables to grab point values
        // with minimal object property lookups
        var point, x, y;

        // Gather the sum of all x values, the sum of all
        // y values, and the sum of x^2 and (x*y) for each
        // value.
        //
        // In math notation, these would be SS_x, SS_y, SS_xx, and SS_xy
        for (var i = 0; i < dataLength; i++) {
            point = data[i];
            x = point[0];
            y = point[1];

            sumX += x;
            sumY += y;

            sumXX += x * x;
            sumXY += x * y;
        }

        // `m` is the slope of the regression line
        m =
            (dataLength * sumXY - sumX * sumY) /
            (dataLength * sumXX - sumX * sumX);

        // `b` is the y-intercept of the line.
        b = sumY / dataLength - (m * sumX) / dataLength;
    }

    // Return both values as an object.
    return {
        m: m,
        b: b
    };
}

/**
 * Given the output of `linearRegression`: an object
 * with `m` and `b` values indicating slope and intercept,
 * respectively, generate a line function that translates
 * x values into y values.
 *
 * @param {Object} mb object with `m` and `b` members, representing
 * slope and intercept of desired line
 * @returns {Function} method that computes y-value at any given
 * x-value on the line.
 * @example
 * var l = linearRegressionLine(linearRegression([[0, 0], [1, 1]]));
 * l(0) // = 0
 * l(2) // = 2
 * linearRegressionLine({ b: 0, m: 1 })(1); // => 1
 * linearRegressionLine({ b: 1, m: 1 })(1); // => 2
 */
function linearRegressionLine(mb /*: { b: number, m: number }*/) {
    // Return a function that computes a `y` value for each
    // x value it is given, based on the values of `m` and `b`
    // that we just computed.
    return function (x) {
        return mb.b + mb.m * x;
    };
}

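// Illustrative check (added for exposition, not part of the upstream
// library): fitting noiseless points drawn from y = 2x + 1 should
// recover the generating slope and intercept exactly, and the derived
// line function should extrapolate them.
function demoLinearRegression() {
    var fit = linearRegression([[0, 1], [1, 3], [2, 5]]);
    var line = linearRegressionLine(fit);
    return [fit.m, fit.b, line(10)]; // => [2, 1, 21]
}
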
/**
 * Our default sum is the [Kahan-Babuska algorithm](https://pdfs.semanticscholar.org/1760/7d467cda1d0277ad272deb2113533131dc09.pdf).
 * This method is an improvement over the classical
 * [Kahan summation algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm).
 * It computes the sum of a list of numbers while correcting for
 * floating-point errors. Traditionally, sums are calculated as many
 * successive additions, each one with its own floating-point roundoff. These
 * losses in precision add up as the number of numbers increases. This
 * algorithm is more accurate than naive repeated addition.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x input
 * @return {number} sum of all input numbers
 * @example
 * sum([1, 2, 3]); // => 6
 */
function sum(x) {
    // If the array is empty, we needn't bother computing its sum
    if (x.length === 0) {
        return 0;
    }

    // Initializing the sum as the first number in the array
    var sum = x[0];

    // Keeping track of the floating-point error correction
    var correction = 0;

    var transition;

    for (var i = 1; i < x.length; i++) {
        transition = sum + x[i];

        // Here we need to update the correction in a different fashion
        // if the new absolute value is greater than the absolute sum
        if (Math.abs(sum) >= Math.abs(x[i])) {
            correction += sum - transition + x[i];
        } else {
            correction += x[i] - transition + sum;
        }

        sum = transition;
    }

    // Returning the corrected sum
    return sum + correction;
}

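// Worked illustration (exposition only): the correction term rescues
// digits that naive left-to-right addition throws away. Adding 1 to
// 1e100 loses the 1 entirely in double precision, so a naive loop over
// the values below returns 0, while the corrected sum returns 2.
function demoKahanBabuska() {
    return sum([1, 1e100, 1, -1e100]); // => 2, not 0
}
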
/**
 * The mean, _also known as average_,
 * is the sum of all values over the number of values.
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @throws {Error} if the length of x is less than one
 * @returns {number} mean
 * @example
 * mean([0, 10]); // => 5
 */
function mean(x) {
    if (x.length === 0) {
        throw new Error("mean requires at least one data point");
    }

    return sum(x) / x.length;
}

/**
 * The sum of deviations to the Nth power.
 * When n=2 it's the sum of squared deviations.
 * When n=3 it's the sum of cubed deviations.
 *
 * @param {Array<number>} x
 * @param {number} n power
 * @returns {number} sum of nth power deviations
 *
 * @example
 * var input = [1, 2, 3];
 * // since the variance of a set is the mean squared
 * // deviations, we can calculate that with sumNthPowerDeviations:
 * sumNthPowerDeviations(input, 2) / input.length;
 */
function sumNthPowerDeviations(x, n) {
    var meanValue = mean(x);
    var sum = 0;
    var tempValue;
    var i;

    // This is an optimization: when n is 2 (we're computing a number squared),
    // multiplying the number by itself is significantly faster than using
    // the Math.pow method.
    if (n === 2) {
        for (i = 0; i < x.length; i++) {
            tempValue = x[i] - meanValue;
            sum += tempValue * tempValue;
        }
    } else {
        for (i = 0; i < x.length; i++) {
            sum += Math.pow(x[i] - meanValue, n);
        }
    }

    return sum;
}

/**
 * The [variance](http://en.wikipedia.org/wiki/Variance)
 * is the mean of the squared deviations from the mean.
 *
 * This is an implementation of population variance, not sample variance:
 * see the `sampleVariance` method if you want a sample measure.
 *
 * @param {Array<number>} x a population of one or more data points
 * @returns {number} variance: a value greater than or equal to zero.
 * zero indicates that all values are identical.
 * @throws {Error} if x's length is 0
 * @example
 * variance([1, 2, 3, 4, 5, 6]); // => 2.9166666666666665
 */
function variance(x) {
    if (x.length === 0) {
        throw new Error("variance requires at least one data point");
    }

    // Find the mean of squared deviations between the
    // mean value and each value.
    return sumNthPowerDeviations(x, 2) / x.length;
}

/**
 * The [standard deviation](http://en.wikipedia.org/wiki/Standard_deviation)
 * is the square root of the variance. This is also known as the population
 * standard deviation. It's useful for measuring the amount
 * of variation or dispersion in a set of values.
 *
 * Standard deviation is only appropriate for full-population knowledge: for
 * samples of a population, {@link sampleStandardDeviation} is
 * more appropriate.
 *
 * @param {Array<number>} x input
 * @returns {number} standard deviation
 * @example
 * variance([2, 4, 4, 4, 5, 5, 7, 9]); // => 4
 * standardDeviation([2, 4, 4, 4, 5, 5, 7, 9]); // => 2
 */
function standardDeviation(x) {
    if (x.length === 1) {
        return 0;
    }
    var v = variance(x);
    return Math.sqrt(v);
}

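// Relationship check (exposition only; `sampleVariance` is defined
// later in this file, which is fine since function declarations hoist):
// the population variance equals the sample variance scaled by
// (n - 1) / n, i.e. Bessel's correction undone.
function demoVarianceRelation() {
    var x = [1, 2, 3, 4, 5, 6];
    var n = x.length;
    return variance(x) === (sampleVariance(x) * (n - 1)) / n; // => true
}
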
/**
 * The [R Squared](http://en.wikipedia.org/wiki/Coefficient_of_determination)
 * value of data compared with a function `f`
 * measures how well the function predicts the actual values: it is
 * one minus the ratio of the squared prediction errors to the total
 * sum of squares.
 *
 * @param {Array<Array<number>>} x input data: this should be doubly-nested
 * @param {Function} func function called on `[i][0]` values within the dataset
 * @returns {number} r-squared value
 * @example
 * var samples = [[0, 0], [1, 1]];
 * var regressionLine = linearRegressionLine(linearRegression(samples));
 * rSquared(samples, regressionLine); // = 1 this line is a perfect fit
 */
function rSquared(x, func) {
    if (x.length < 2) {
        return 1;
    }

    // Compute the average y value for the actual
    // data set in order to compute the
    // _total sum of squares_
    var sum = 0;
    for (var i = 0; i < x.length; i++) {
        sum += x[i][1];
    }
    var average = sum / x.length;

    // Compute the total sum of squares - the
    // squared difference between each point
    // and the average of all points.
    var sumOfSquares = 0;
    for (var j = 0; j < x.length; j++) {
        sumOfSquares += Math.pow(average - x[j][1], 2);
    }

    // Finally estimate the error: the squared
    // difference between the estimate and the actual data
    // value at each point.
    var err = 0;
    for (var k = 0; k < x.length; k++) {
        err += Math.pow(x[k][1] - func(x[k][0]), 2);
    }

    // As the error grows larger, its ratio to the
    // sum of squares increases and the r squared
    // value grows lower.
    return 1 - err / sumOfSquares;
}

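// Worked example (exposition only): with a third point that breaks
// perfect linearity, the residual sum of squares is 1/6 against a
// total sum of squares of 2/3, so R squared drops to 0.75.
function demoRSquared() {
    var samples = [[0, 0], [1, 1], [2, 1]];
    var line = linearRegressionLine(linearRegression(samples));
    return rSquared(samples, line); // => approximately 0.75
}
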
/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times.
 * There can be multiple modes in a list: in the event of a tie, this
 * algorithm will return the most recently seen mode.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * This runs in `O(n)` because the input is sorted.
 *
 * @param {Array<number>} sorted a sample of one or more data points
 * @returns {number} mode
 * @throws {Error} if sorted is empty
 * @example
 * modeSorted([0, 0, 1]); // => 0
 */
function modeSorted(sorted) {
    // Handle edge cases:
    // The mode of an empty list is undefined
    if (sorted.length === 0) {
        throw new Error("mode requires at least one data point");
    } else if (sorted.length === 1) {
        return sorted[0];
    }

    // This assumes it is dealing with an array of size > 1, since size
    // 0 and 1 are handled immediately. Hence it starts at index 1 in the
    // array.
    var last = sorted[0],
        // store the mode as we find new modes
        value = NaN,
        // store how many times we've seen the mode
        maxSeen = 0,
        // how many times the current candidate for the mode
        // has been seen
        seenThis = 1;

    // end at sorted.length + 1 to fix the case in which the mode is
    // the highest number that occurs in the sequence. the last iteration
    // compares sorted[i], which is undefined, to the highest number
    // in the series
    for (var i = 1; i < sorted.length + 1; i++) {
        // we're seeing a new number pass by
        if (sorted[i] !== last) {
            // the last number is the new mode since we saw it more
            // often than the old one
            if (seenThis > maxSeen) {
                maxSeen = seenThis;
                value = last;
            }
            seenThis = 1;
            last = sorted[i];
            // if this isn't a new number, it's one more occurrence of
            // the potential mode
        } else {
            seenThis++;
        }
    }
    return value;
}

/**
 * Sort an array of numbers by their numeric value, ensuring that the
 * array is not changed in place.
 *
 * This is necessary because the default behavior of .sort
 * in JavaScript is to sort arrays as string values
 *
 *     [1, 10, 12, 102, 20].sort()
 *     // output
 *     [1, 10, 102, 12, 20]
 *
 * @param {Array<number>} x input array
 * @return {Array<number>} sorted array
 * @private
 * @example
 * numericSort([3, 2, 1]) // => [1, 2, 3]
 */
function numericSort(x) {
    return (
        x
            // ensure the array is not changed in-place
            .slice()
            // comparator function that treats input as numeric
            .sort(function (a, b) {
                return a - b;
            })
    );
}

/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times.
 * There can be multiple modes in a list: in the event of a tie, this
 * algorithm will return the most recently seen mode.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * This runs in `O(n log(n))` because it needs to sort the array internally
 * before running an `O(n)` search to find the mode.
 *
 * @param {Array<number>} x input
 * @returns {number} mode
 * @example
 * mode([0, 0, 1]); // => 0
 */
function mode(x) {
    // Sorting the array lets us iterate through it below and be sure
    // that every time we see a new number it's new and we'll never
    // see the same number twice
    return modeSorted(numericSort(x));
}

/* globals Map: false */

/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times.
 * There can be multiple modes in a list: in the event of a tie, this
 * algorithm will return the most recently seen mode.
 *
 * modeFast uses a Map object to keep track of the mode, instead of the approach
 * used with `mode`, a sorted array. As a result, it is faster
 * than `mode` and supports any data type that can be compared with `==`.
 * It also requires a
 * [JavaScript environment with support for Map](https://kangax.github.io/compat-table/es6/#test-Map),
 * and will throw an error if Map is not available.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * @param {Array<*>} x a sample of one or more data points
 * @returns {?*} mode
 * @throws {ReferenceError} if the JavaScript environment doesn't support Map
 * @throws {Error} if x is empty
 * @example
 * modeFast(['rabbits', 'rabbits', 'squirrels']); // => 'rabbits'
 */
function modeFast(x) {
    // This index will reflect the incidence of different values, indexing
    // them like
    // { value: count }
    var index = new Map();

    // A running `mode` and the number of times it has been encountered.
    var mode;
    var modeCount = 0;

    for (var i = 0; i < x.length; i++) {
        var newCount = index.get(x[i]);
        if (newCount === undefined) {
            newCount = 1;
        } else {
            newCount++;
        }
        if (newCount > modeCount) {
            mode = x[i];
            modeCount = newCount;
        }
        index.set(x[i], newCount);
    }

    if (modeCount === 0) {
        throw new Error("mode requires at least one data point");
    }

    return mode;
}

/**
 * The min is the lowest number in the array.
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @throws {Error} if the length of x is less than one
 * @returns {number} minimum value
 * @example
 * min([1, 5, -10, 100, 2]); // => -10
 */
function min(x) {
    if (x.length === 0) {
        throw new Error("min requires at least one data point");
    }

    var value = x[0];
    for (var i = 1; i < x.length; i++) {
        if (x[i] < value) {
            value = x[i];
        }
    }
    return value;
}

/**
 * This computes the maximum number in an array.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @returns {number} maximum value
 * @throws {Error} if the length of x is less than one
 * @example
 * max([1, 2, 3, 4]);
 * // => 4
 */
function max(x) {
    if (x.length === 0) {
        throw new Error("max requires at least one data point");
    }

    var value = x[0];
    for (var i = 1; i < x.length; i++) {
        if (x[i] > value) {
            value = x[i];
        }
    }
    return value;
}

/**
 * This computes the minimum & maximum number in an array.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @returns {Array<number>} minimum & maximum value
 * @throws {Error} if the length of x is less than one
 * @example
 * extent([1, 2, 3, 4]);
 * // => [1, 4]
 */
function extent(x) {
    if (x.length === 0) {
        throw new Error("extent requires at least one data point");
    }

    var min = x[0];
    var max = x[0];
    for (var i = 1; i < x.length; i++) {
        if (x[i] > max) {
            max = x[i];
        }
        if (x[i] < min) {
            min = x[i];
        }
    }
    return [min, max];
}

/**
 * The minimum is the lowest number in the array. With a sorted array,
 * the first element in the array is always the smallest, so this calculation
 * can be done in one step, or constant time.
 *
 * @param {Array<number>} x input
 * @returns {number} minimum value
 * @example
 * minSorted([-100, -10, 1, 2, 5]); // => -100
 */
function minSorted(x) {
    return x[0];
}

/**
 * The maximum is the highest number in the array. With a sorted array,
 * the last element in the array is always the largest, so this calculation
 * can be done in one step, or constant time.
 *
 * @param {Array<number>} x input
 * @returns {number} maximum value
 * @example
 * maxSorted([-100, -10, 1, 2, 5]); // => 5
 */
function maxSorted(x) {
    return x[x.length - 1];
}

/**
 * The extent is the lowest & highest number in the array. With a sorted array,
 * the first element in the array is always the lowest while the last element is
 * always the largest, so this calculation can be done in one step, or constant time.
 *
 * @param {Array<number>} x input
 * @returns {Array<number>} minimum & maximum value
 * @example
 * extentSorted([-100, -10, 1, 2, 5]); // => [-100, 5]
 */
function extentSorted(x) {
    return [x[0], x[x.length - 1]];
}

/**
 * The simple [sum](https://en.wikipedia.org/wiki/Summation) of an array
 * is the result of adding all numbers together, starting from zero.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x input
 * @return {number} sum of all input numbers
 * @example
 * sumSimple([1, 2, 3]); // => 6
 */
function sumSimple(x) {
    var value = 0;
    for (var i = 0; i < x.length; i++) {
        value += x[i];
    }
    return value;
}

/**
 * The [product](https://en.wikipedia.org/wiki/Product_(mathematics)) of an array
 * is the result of multiplying all numbers together, starting with one, the
 * multiplicative identity.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x input
 * @return {number} product of all input numbers
 * @example
 * product([1, 2, 3, 4]); // => 24
 */
function product(x) {
    var value = 1;
    for (var i = 0; i < x.length; i++) {
        value *= x[i];
    }
    return value;
}

/**
 * This is the internal implementation of quantiles: when you know
 * that the order is sorted, you don't need to re-sort it, and the computations
 * are faster.
 *
 * @param {Array<number>} x sample of one or more data points
 * @param {number} p desired quantile: a number between 0 to 1, inclusive
 * @returns {number} quantile value
 * @throws {Error} if p is outside of the range from 0 to 1
 * @throws {Error} if x is empty
 * @example
 * quantileSorted([3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20], 0.5); // => 9
 */
function quantileSorted(x, p) {
    var idx = x.length * p;
    if (x.length === 0) {
        throw new Error("quantile requires at least one data point.");
    } else if (p < 0 || p > 1) {
        throw new Error("quantiles must be between 0 and 1");
    } else if (p === 1) {
        // If p is 1, directly return the last element
        return x[x.length - 1];
    } else if (p === 0) {
        // If p is 0, directly return the first element
        return x[0];
    } else if (idx % 1 !== 0) {
        // If idx is not an integer, return the next element in the array
        return x[Math.ceil(idx) - 1];
    } else if (x.length % 2 === 0) {
        // If the list has even length, we'll take the average of this number
        // and the next value, if there is one
        return (x[idx - 1] + x[idx]) / 2;
    } else {
        // Finally, in the simple case of an integer index
        // with an odd-length list, return the x value at the index.
        return x[idx];
    }
}

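// Worked example of the branches above (exposition only): for the
// even-length input [1, 2, 3, 4] and p = 0.5, idx = 2 is an integer,
// so the median averages x[1] and x[2]; for p = 0.3, idx = 1.2 is
// fractional, so the value at Math.ceil(1.2) - 1 = index 1 is used.
// quantileSorted([1, 2, 3, 4], 0.5); // => 2.5
// quantileSorted([1, 2, 3, 4], 0.3); // => 2
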
/**
 * Rearrange items in `arr` so that all items in `[left, k]` range are the smallest.
 * The `k`-th element will have the `(k - left + 1)`-th smallest value in `[left, right]`.
 *
 * Implements the [Floyd-Rivest selection algorithm](https://en.wikipedia.org/wiki/Floyd-Rivest_algorithm).
 *
 * @param {Array<number>} arr input array
 * @param {number} k pivot index
 * @param {number} [left] left index
 * @param {number} [right] right index
 * @returns {void} mutates input array
 * @example
 * var arr = [65, 28, 59, 33, 21, 56, 22, 95, 50, 12, 90, 53, 28, 77, 39];
 * quickselect(arr, 8);
 * // = [39, 28, 28, 33, 21, 12, 22, 50, 53, 56, 59, 65, 90, 77, 95]
 */
function quickselect(arr, k, left, right) {
    left = left || 0;
    right = right || arr.length - 1;

    while (right > left) {
        // 600 and 0.5 are arbitrary constants chosen in the original paper to minimize execution time
        if (right - left > 600) {
            var n = right - left + 1;
            var m = k - left + 1;
            var z = Math.log(n);
            var s = 0.5 * Math.exp((2 * z) / 3);
            var sd = 0.5 * Math.sqrt((z * s * (n - s)) / n);
            if (m - n / 2 < 0) { sd *= -1; }
            var newLeft = Math.max(left, Math.floor(k - (m * s) / n + sd));
            var newRight = Math.min(
                right,
                Math.floor(k + ((n - m) * s) / n + sd)
            );
            quickselect(arr, k, newLeft, newRight);
        }

        var t = arr[k];
        var i = left;
        var j = right;

        swap(arr, left, k);
        if (arr[right] > t) { swap(arr, left, right); }

        while (i < j) {
            swap(arr, i, j);
            i++;
            j--;
            while (arr[i] < t) { i++; }
            while (arr[j] > t) { j--; }
        }

        if (arr[left] === t) { swap(arr, left, j); }
        else {
            j++;
            swap(arr, j, right);
        }

        if (j <= k) { left = j + 1; }
        if (k <= j) { right = j - 1; }
    }
}

function swap(arr, i, j) {
    var tmp = arr[i];
    arr[i] = arr[j];
    arr[j] = tmp;
}

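// Usage sketch (exposition only): selection places the k-th smallest
// value at index k without fully sorting, which is how the quantile
// functions below avoid an O(n log n) sort.
function demoQuickselect() {
    var arr = [9, 1, 8, 2, 7, 3, 6, 4, 5];
    quickselect(arr, 4);
    return arr[4]; // => 5, the median, with arr only partially ordered
}
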
/**
 * The [quantile](https://en.wikipedia.org/wiki/Quantile):
 * this is a population quantile, since we assume to know the entire
 * dataset in this library. This is an implementation of the
 * [Quantiles of a Population](http://en.wikipedia.org/wiki/Quantile#Quantiles_of_a_population)
 * algorithm from wikipedia.
 *
 * Sample is a one-dimensional array of numbers,
 * and p is either a decimal number from 0 to 1 or an array of decimal
 * numbers from 0 to 1.
 * In terms of a k/q quantile, p = k/q: the fraction is simply expressed
 * as a decimal value.
 * When p is an array, the result of the function is also an array containing the
 * appropriate quantiles in input order
 *
 * @param {Array<number>} x sample of one or more numbers
 * @param {Array<number> | number} p the desired quantile, as a number between 0 and 1
 * @returns {number | Array<number>} quantile or quantiles
 * @example
 * quantile([3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20], 0.5); // => 9
 */
function quantile(x, p) {
    var copy = x.slice();

    if (Array.isArray(p)) {
        // rearrange elements so that each element corresponding to a requested
        // quantile is on a place it would be if the array was fully sorted
        multiQuantileSelect(copy, p);
        // Initialize the result array
        var results = [];
        // For each requested quantile
        for (var i = 0; i < p.length; i++) {
            results[i] = quantileSorted(copy, p[i]);
        }
        return results;
    } else {
        var idx = quantileIndex(copy.length, p);
        quantileSelect(copy, idx, 0, copy.length - 1);
        return quantileSorted(copy, p);
    }
}

function quantileSelect(arr, k, left, right) {
    if (k % 1 === 0) {
        quickselect(arr, k, left, right);
    } else {
        k = Math.floor(k);
        quickselect(arr, k, left, right);
        quickselect(arr, k + 1, k + 1, right);
    }
}

function multiQuantileSelect(arr, p) {
    var indices = [0];
    for (var i = 0; i < p.length; i++) {
        indices.push(quantileIndex(arr.length, p[i]));
    }
    indices.push(arr.length - 1);
    indices.sort(compare);

    var stack = [0, indices.length - 1];

    while (stack.length) {
        var r = Math.ceil(stack.pop());
        var l = Math.floor(stack.pop());
        if (r - l <= 1) { continue; }

        var m = Math.floor((l + r) / 2);
        quantileSelect(
            arr,
            indices[m],
            Math.floor(indices[l]),
            Math.ceil(indices[r])
        );

        stack.push(l, m, m, r);
    }
}

function compare(a, b) {
    return a - b;
}

function quantileIndex(len, p) {
    var idx = len * p;
    if (p === 1) {
        // If p is 1, directly return the last index
        return len - 1;
    } else if (p === 0) {
        // If p is 0, directly return the first index
        return 0;
    } else if (idx % 1 !== 0) {
        // If idx is not an integer, return the next index in the array
        return Math.ceil(idx) - 1;
    } else if (len % 2 === 0) {
        // If the list has even length, we'll return the middle of the two indices
        // around the quantile to indicate that we need an average value of the two
        return idx - 0.5;
    } else {
        // Finally, in the simple case of an integer index
        // with an odd-length list, return the index
        return idx;
    }
}

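// Usage sketch (exposition only): passing an array of probabilities
// partitions the copy once with multiQuantileSelect and then reads
// each quantile off the partially sorted array, rather than paying
// for a selection pass per quantile.
function demoQuartiles() {
    var x = [3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20];
    return quantile(x, [0.25, 0.5, 0.75]); // => [7, 9, 15]
}
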
/* eslint no-bitwise: 0 */

/**
 * This function returns the quantile in which one would find the given value in
 * the given array. With a sorted array, leveraging binary search, we can find
 * this information in logarithmic time.
 *
 * @param {Array<number>} x input
 * @param {number} value value to evaluate
 * @returns {number} quantile rank
 * @example
 * quantileRankSorted([1, 2, 3, 4], 3); // => 0.75
 * quantileRankSorted([1, 2, 3, 3, 4], 3); // => 0.7
 * quantileRankSorted([1, 2, 3, 4], 6); // => 1
 * quantileRankSorted([1, 2, 3, 3, 5], 4); // => 0.8
 */
function quantileRankSorted(x, value) {
    // Value is less than any value in the array
    if (value < x[0]) {
        return 0;
    }

    // Value is greater than any value in the array
    if (value > x[x.length - 1]) {
        return 1;
    }

    var l = lowerBound(x, value);

    // Value is not in the array
    if (x[l] !== value) {
        return l / x.length;
    }

    l++;

    var u = upperBound(x, value);

    // The value exists only once in the array
    if (u === l) {
        return l / x.length;
    }

    // Here, we are basically computing the mean of the range of indices
    // containing our searched value. But, instead of initializing an
    // array and looping over it, there is a dedicated math formula that
    // we apply below to get the result.
    var r = u - l + 1;
    var sum = (r * (u + l)) / 2;
    var mean = sum / r;

    return mean / x.length;
}

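// Cross-check of the closed-form midrank above (exposition only,
// hypothetical helper): for a value that occurs in the array, the
// quantile rank is the mean of the 1-based ranks it occupies, divided
// by the array length. A naive count gives the same answer.
function demoQuantileRankNaive(x, value) {
    var below = 0;
    var equal = 0;
    for (var i = 0; i < x.length; i++) {
        if (x[i] < value) { below++; }
        if (x[i] === value) { equal++; }
    }
    return (below + (equal + 1) / 2) / x.length;
}
// demoQuantileRankNaive([1, 2, 3, 3, 4], 3); // => 0.7, matching
// quantileRankSorted([1, 2, 3, 3, 4], 3)
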
function lowerBound(x, value) {
    var mid = 0;
    var lo = 0;
    var hi = x.length;

    while (lo < hi) {
        mid = (lo + hi) >>> 1;

        if (value <= x[mid]) {
            hi = mid;
        } else {
            // -~mid is a bitwise way of writing mid + 1
            // (hence the eslint exemption above)
            lo = -~mid;
        }
    }

    return lo;
}

function upperBound(x, value) {
    var mid = 0;
    var lo = 0;
    var hi = x.length;

    while (lo < hi) {
        mid = (lo + hi) >>> 1;

        if (value >= x[mid]) {
            lo = -~mid;
        } else {
            hi = mid;
        }
    }

    return lo;
}

/**
 * This function returns the quantile in which one would find the given value in
 * the given array. It will copy and sort your array before each run, so
 * if you know your array is already sorted, you should use `quantileRankSorted`
 * instead.
 *
 * @param {Array<number>} x input
 * @param {number} value value to evaluate
 * @returns {number} quantile rank
 * @example
 * quantileRank([4, 3, 1, 2], 3); // => 0.75
 * quantileRank([4, 3, 2, 3, 1], 3); // => 0.7
 * quantileRank([2, 4, 1, 3], 6); // => 1
 * quantileRank([5, 3, 1, 2, 3], 4); // => 0.8
 */
function quantileRank(x, value) {
    // Cloning and sorting the array
    var sortedCopy = numericSort(x);

    return quantileRankSorted(sortedCopy, value);
}

/**
 * The [Interquartile range](http://en.wikipedia.org/wiki/Interquartile_range) is
 * a measure of statistical dispersion, or how scattered, spread, or
 * concentrated a distribution is. It's computed as the difference between
 * the third quartile and first quartile.
 *
 * @param {Array<number>} x sample of one or more numbers
 * @returns {number} interquartile range: the span between lower and upper quartile,
 * 0.25 and 0.75
 * @example
 * interquartileRange([0, 1, 2, 3]); // => 2
 */
function interquartileRange(x) {
    // Interquartile range is the span between the upper quartile,
    // at `0.75`, and lower quartile, `0.25`
    var q1 = quantile(x, 0.75);
    var q2 = quantile(x, 0.25);

    if (typeof q1 === "number" && typeof q2 === "number") {
        return q1 - q2;
    }
}

/**
 * The [median](http://en.wikipedia.org/wiki/Median) is
 * the middle number of a list. This is often a good indicator of 'the middle'
 * when there are outliers that skew the `mean()` value.
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * The median isn't necessarily one of the elements in the list: the value
 * can be the average of two elements if the list has an even length
 * and the two central values are different.
 *
 * @param {Array<number>} x input
 * @returns {number} median value
 * @example
 * median([10, 2, 5, 100, 2, 1]); // => 3.5
 */
function median(x) {
    return +quantile(x, 0.5);
}

/**
 * The [Median Absolute Deviation](http://en.wikipedia.org/wiki/Median_absolute_deviation) is
 * a robust measure of statistical
 * dispersion. It is more resilient to outliers than the standard deviation.
 *
 * @param {Array<number>} x input array
 * @returns {number} median absolute deviation
 * @example
 * medianAbsoluteDeviation([1, 1, 2, 2, 4, 6, 9]); // => 1
 */
function medianAbsoluteDeviation(x) {
    var medianValue = median(x);
    var medianAbsoluteDeviations = [];

    // Make a list of absolute deviations from the median
    for (var i = 0; i < x.length; i++) {
        medianAbsoluteDeviations.push(Math.abs(x[i] - medianValue));
    }

    // Find the median value of that list
    return median(medianAbsoluteDeviations);
}

/**
 * Split an array into chunks of a specified size. This function
 * has the same behavior as [PHP's array_chunk](http://php.net/manual/en/function.array-chunk.php)
 * function, and thus will insert smaller-sized chunks at the end if
 * the input size is not divisible by the chunk size.
 *
 * `x` is expected to be an array, and `chunkSize` a number.
 * The `x` array can contain any kind of data.
 *
 * @param {Array} x a sample
 * @param {number} chunkSize size of each output array. must be a positive integer
 * @returns {Array<Array>} a chunked array
 * @throws {Error} if chunk size is less than 1 or not an integer
 * @example
 * chunk([1, 2, 3, 4, 5, 6], 2);
 * // => [[1, 2], [3, 4], [5, 6]]
 */
function chunk(x, chunkSize) {
    // a list of result chunks, as arrays in an array
    var output = [];

    // `chunkSize` must be one or higher - otherwise the loop below,
    // in which we call `start += chunkSize`, will loop infinitely.
    // So, we'll detect and throw in that case to indicate
    // invalid input.
    if (chunkSize < 1) {
        throw new Error("chunk size must be a positive number");
    }

    if (Math.floor(chunkSize) !== chunkSize) {
        throw new Error("chunk size must be an integer");
    }

    // `start` is the index at which `.slice` will start selecting
    // new array elements
    for (var start = 0; start < x.length; start += chunkSize) {
        // for each chunk, slice that part of the array and add it
        // to the output. The `.slice` function does not change
        // the original array.
        output.push(x.slice(start, start + chunkSize));
    }
    return output;
}

/**
 * Sampling with replacement is a type of sampling that allows the same
 * item to be picked out of a population more than once.
 *
 * @param {Array<*>} x an array of any kind of value
 * @param {number} n count of how many elements to take
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @return {Array} n sampled items from the population
 * @example
 * var values = [1, 2, 3, 4];
 * sampleWithReplacement(values, 2); // returns 2 random values, like [2, 4];
 */
function sampleWithReplacement(x, n, randomSource) {
    if (x.length === 0) {
        return [];
    }

    // a custom random number source can be provided if you want to use
    // a fixed seed or another random number generator, like
    // [random-js](https://www.npmjs.org/package/random-js)
    randomSource = randomSource || Math.random;

    var length = x.length;
    var sample = [];

    for (var i = 0; i < n; i++) {
        var index = Math.floor(randomSource() * length);

        sample.push(x[index]);
    }

    return sample;
}

/**
 * A [Fisher-Yates shuffle](http://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle)
 * in-place - which means that it **will change the order of the original
 * array by reference**.
 *
 * This is an algorithm that generates a random [permutation](https://en.wikipedia.org/wiki/Permutation)
 * of a set.
 *
 * @param {Array} x sample of one or more numbers
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @returns {Array} x
 * @example
 * var x = [1, 2, 3, 4];
 * shuffleInPlace(x);
 * // x is shuffled to a value like [2, 1, 4, 3]
 */
function shuffleInPlace(x, randomSource) {
    // a custom random number source can be provided if you want to use
    // a fixed seed or another random number generator, like
    // [random-js](https://www.npmjs.org/package/random-js)
    randomSource = randomSource || Math.random;

    // store the current length of the x to determine
    // when no elements remain to shuffle.
    var length = x.length;

    // temporary is used to hold an item when it is being
    // swapped between indices.
    var temporary;

    // The index to swap at each stage.
    var index;

    // While there are still items to shuffle
    while (length > 0) {
        // choose a random index within the subset of the array
        // that is not yet shuffled
        index = Math.floor(randomSource() * length--);

        // store the value that we'll move temporarily
        temporary = x[length];

        // swap the value at `x[length]` with `x[index]`
        x[length] = x[index];
        x[index] = temporary;
    }

    return x;
}

/**
 * A [Fisher-Yates shuffle](http://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle)
 * is a fast way to create a random permutation of a finite set. This is
 * a function around `shuffleInPlace` that adds the guarantee that
 * it will not modify its input.
 *
 * @param {Array} x sample of 0 or more numbers
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @return {Array} shuffled version of input
 * @example
 * var shuffled = shuffle([1, 2, 3, 4]);
 * shuffled; // = [2, 3, 1, 4] or any other random permutation
 */
function shuffle(x, randomSource) {
    // slice the original array so that it is not modified
    var sample = x.slice();

    // and then shuffle that shallow-copied array, in place
    return shuffleInPlace(sample, randomSource);
}

/**
 * Create a [simple random sample](http://en.wikipedia.org/wiki/Simple_random_sample)
 * from a given array of `n` elements.
 *
 * The sampled values will be in any order, not necessarily the order
 * they appear in the input.
 *
 * @param {Array<any>} x input array. can contain any type
 * @param {number} n count of how many elements to take
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @return {Array} subset of n elements in original array
 *
 * @example
 * var values = [1, 2, 4, 5, 6, 7, 8, 9];
 * sample(values, 3); // returns 3 random values, like [2, 5, 8];
 */
function sample(x, n, randomSource) {
    // shuffle the original array using a fisher-yates shuffle
    var shuffled = shuffle(x, randomSource);

    // and then return a subset of it - the first `n` elements.
    return shuffled.slice(0, n);
}

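// Sketch of a seeded entropy source (a hypothetical helper, not part of
// the library): any function returning numbers in [0, 1) is accepted as
// `randomSource`, so a small linear congruential generator makes the
// shuffling and sampling functions above reproducible across runs.
function makeSeededSource(seed) {
    var state = seed >>> 0;
    return function () {
        // LCG constants from Numerical Recipes; the product stays within
        // the safe-integer range before being reduced modulo 2^32
        state = (state * 1664525 + 1013904223) >>> 0;
        return state / 4294967296;
    };
}
// shuffle([1, 2, 3, 4], makeSeededSource(42)) returns the same
// permutation on every run.
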
/**
 * Create a new column x row matrix.
 *
 * @private
 * @param {number} columns
 * @param {number} rows
 * @return {Array<Array<number>>} matrix
 * @example
 * makeMatrix(10, 10);
 */
function makeMatrix(columns, rows) {
    var matrix = [];
    for (var i = 0; i < columns; i++) {
        var column = [];
        for (var j = 0; j < rows; j++) {
            column.push(0);
        }
        matrix.push(column);
    }
    return matrix;
}

/**
 * For a sorted input, counting the number of unique values
 * is possible in linear time and constant memory. This is
 * a simple implementation of the algorithm.
 *
 * Values are compared with `===`, so objects and non-primitive objects
 * are not handled in any special way.
 *
 * @param {Array<*>} x an array of any kind of value
 * @returns {number} count of unique values
 * @example
 * uniqueCountSorted([1, 2, 3]); // => 3
 * uniqueCountSorted([1, 1, 1]); // => 1
 */
function uniqueCountSorted(x) {
    var uniqueValueCount = 0,
        lastSeenValue;
    for (var i = 0; i < x.length; i++) {
        if (i === 0 || x[i] !== lastSeenValue) {
            lastSeenValue = x[i];
            uniqueValueCount++;
        }
    }
    return uniqueValueCount;
}

/**
 * Generates incrementally computed values based on the sums and sums of
 * squares for the data array
 *
 * @private
 * @param {number} j
 * @param {number} i
 * @param {Array<number>} sums
 * @param {Array<number>} sumsOfSquares
 * @return {number}
 * @example
 * ssq(0, 1, [-1, 0, 2], [1, 1, 5]);
 */
function ssq(j, i, sums, sumsOfSquares) {
    var sji; // s(j, i)
    if (j > 0) {
        var muji = (sums[i] - sums[j - 1]) / (i - j + 1); // mu(j, i)
        sji =
            sumsOfSquares[i] - sumsOfSquares[j - 1] - (i - j + 1) * muji * muji;
    } else {
        sji = sumsOfSquares[i] - (sums[i] * sums[i]) / (i + 1);
    }
    if (sji < 0) {
        return 0;
    }
    return sji;
}

/**
 * Function that recursively divides and conquers computations
 * for cluster j
 *
 * @private
 * @param {number} iMin Minimum index in cluster to be computed
 * @param {number} iMax Maximum index in cluster to be computed
 * @param {number} cluster Index of the cluster currently being computed
 * @param {Array<Array<number>>} matrix
 * @param {Array<Array<number>>} backtrackMatrix
 * @param {Array<number>} sums
 * @param {Array<number>} sumsOfSquares
 */
function fillMatrixColumn(
    iMin,
    iMax,
    cluster,
    matrix,
    backtrackMatrix,
    sums,
    sumsOfSquares
) {
    if (iMin > iMax) {
        return;
    }

    // Start at midpoint between iMin and iMax
    var i = Math.floor((iMin + iMax) / 2);

    matrix[cluster][i] = matrix[cluster - 1][i - 1];
    backtrackMatrix[cluster][i] = i;

    var jlow = cluster; // the lower end for j

    if (iMin > cluster) {
        jlow = Math.max(jlow, backtrackMatrix[cluster][iMin - 1] || 0);
    }
    jlow = Math.max(jlow, backtrackMatrix[cluster - 1][i] || 0);

    var jhigh = i - 1; // the upper end for j
    if (iMax < matrix[0].length - 1) {
        jhigh = Math.min(jhigh, backtrackMatrix[cluster][iMax + 1] || 0);
    }

    var sji;
    var sjlowi;
    var ssqjlow;
    var ssqj;
    for (var j = jhigh; j >= jlow; --j) {
        sji = ssq(j, i, sums, sumsOfSquares);

        if (sji + matrix[cluster - 1][jlow - 1] >= matrix[cluster][i]) {
            break;
        }

        // Examine the lower bound of the cluster border
        sjlowi = ssq(jlow, i, sums, sumsOfSquares);

        ssqjlow = sjlowi + matrix[cluster - 1][jlow - 1];

        if (ssqjlow < matrix[cluster][i]) {
            // Shrink the lower bound
            matrix[cluster][i] = ssqjlow;
            backtrackMatrix[cluster][i] = jlow;
        }
        jlow++;

        ssqj = sji + matrix[cluster - 1][j - 1];
        if (ssqj < matrix[cluster][i]) {
            matrix[cluster][i] = ssqj;
            backtrackMatrix[cluster][i] = j;
        }
    }

    fillMatrixColumn(
        iMin,
        i - 1,
        cluster,
        matrix,
        backtrackMatrix,
        sums,
        sumsOfSquares
    );
    fillMatrixColumn(
        i + 1,
        iMax,
        cluster,
        matrix,
        backtrackMatrix,
        sums,
        sumsOfSquares
    );
}

/**
 * Initializes the main matrices used in Ckmeans and kicks
 * off the divide and conquer cluster computation strategy
 *
 * @private
 * @param {Array<number>} data sorted array of values
 * @param {Array<Array<number>>} matrix
 * @param {Array<Array<number>>} backtrackMatrix
 */
function fillMatrices(data, matrix, backtrackMatrix) {
    var nValues = matrix[0].length;

    // Shift values by the median to improve numeric stability
    var shift = data[Math.floor(nValues / 2)];

    // Cumulative sum and cumulative sum of squares for all values in data array
    var sums = [];
    var sumsOfSquares = [];

    // Initialize first column in matrix & backtrackMatrix
    var shiftedValue;
    for (var i = 0; i < nValues; ++i) {
        shiftedValue = data[i] - shift;
        if (i === 0) {
            sums.push(shiftedValue);
            sumsOfSquares.push(shiftedValue * shiftedValue);
        } else {
            sums.push(sums[i - 1] + shiftedValue);
            sumsOfSquares.push(
                sumsOfSquares[i - 1] + shiftedValue * shiftedValue
            );
        }

        // Initialize for cluster = 0
        matrix[0][i] = ssq(0, i, sums, sumsOfSquares);
        backtrackMatrix[0][i] = 0;
    }

    // Initialize the rest of the columns
    var iMin;
    for (var cluster = 1; cluster < matrix.length; ++cluster) {
        if (cluster < matrix.length - 1) {
            iMin = cluster;
        } else {
            // No need to compute matrix[K-1][0] ... matrix[K-1][N-2]
            iMin = nValues - 1;
        }

        fillMatrixColumn(
            iMin,
            nValues - 1,
            cluster,
            matrix,
            backtrackMatrix,
            sums,
            sumsOfSquares
        );
    }
}

/**
 * Ckmeans clustering is an improvement on heuristic-based clustering
 * approaches like Jenks. The algorithm was developed by
 * [Haizhou Wang and Mingzhou Song](http://journal.r-project.org/archive/2011-2/RJournal_2011-2_Wang+Song.pdf)
 * as a [dynamic programming](https://en.wikipedia.org/wiki/Dynamic_programming) approach
 * to the problem of clustering numeric data into groups with the least
 * within-group sum-of-squared-deviations.
 *
 * Minimizing the difference within groups - what Wang & Song refer to as
 * `withinss`, or within sum-of-squares, means that groups are optimally
 * homogeneous within and the data is split into representative groups.
 * This is very useful for visualization, where you may want to represent
 * a continuous variable in discrete color or style groups. This function
 * can provide groups that emphasize differences between data.
 *
 * Being a dynamic programming approach, this algorithm is based on two
 * matrices that store incrementally-computed values for squared deviations
 * and backtracking indexes.
 *
 * This implementation is based on Ckmeans 3.4.6, which introduced a new divide
 * and conquer approach that improved runtime from O(kn^2) to O(kn log(n)).
 *
 * Unlike the [original implementation](https://cran.r-project.org/web/packages/Ckmeans.1d.dp/index.html),
 * this implementation does not include any code to automatically determine
 * the optimal number of clusters: this information needs to be explicitly
 * provided.
 *
 * ### References
 * _Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic
 * Programming_ Haizhou Wang and Mingzhou Song ISSN 2073-4859
 *
 * from The R Journal Vol. 3/2, December 2011
 * @param {Array<number>} x input data, as an array of number values
 * @param {number} nClusters number of desired classes. This cannot be
 * greater than the number of values in the data array.
 * @returns {Array<Array<number>>} clustered input
 * @throws {Error} if the number of requested clusters is higher than the size of the data
 * @example
 * ckmeans([-1, 2, -1, 2, 4, 5, 6, -1, 2, -1], 3);
 * // The input, clustered into groups of similar numbers.
 * //= [[-1, -1, -1, -1], [2, 2, 2], [4, 5, 6]]);
 */
function ckmeans(x, nClusters) {
    if (nClusters > x.length) {
        throw new Error(
            "cannot generate more classes than there are data values"
        );
    }

    var sorted = numericSort(x);
    // we'll use this as the maximum number of clusters
    var uniqueCount = uniqueCountSorted(sorted);

    // if all of the input values are identical, there's one cluster
    // with all of the input in it.
    if (uniqueCount === 1) {
        return [sorted];
    }

    // named 'S' originally
    var matrix = makeMatrix(nClusters, sorted.length);
    // named 'J' originally
    var backtrackMatrix = makeMatrix(nClusters, sorted.length);

    // This is a dynamic programming way to solve the problem of minimizing
    // within-cluster sum of squares. It's similar to linear regression
    // in this way, and this calculation incrementally computes the
    // sum of squares that are later read.
    fillMatrices(sorted, matrix, backtrackMatrix);

    // The real work of Ckmeans clustering happens in the matrix generation:
    // the generated matrices encode all possible clustering combinations, and
    // once they're generated we can solve for the best clustering groups
    // very quickly.
    var clusters = [];
    var clusterRight = backtrackMatrix[0].length - 1;

    // Backtrack the clusters from the dynamic programming matrix. This
    // starts at the bottom-right corner of the matrix (if the top-left is 0, 0),
    // and moves the cluster target with the loop.
    for (var cluster = backtrackMatrix.length - 1; cluster >= 0; cluster--) {
        var clusterLeft = backtrackMatrix[cluster][clusterRight];

        // fill the cluster from the sorted input by taking a slice of the
        // array. the backtrack matrix makes this easy - it stores the
        // indexes where the cluster should start and end.
        clusters[cluster] = sorted.slice(clusterLeft, clusterRight + 1);

        if (cluster > 0) {
            clusterRight = clusterLeft - 1;
        }
    }

    return clusters;
}

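// A small derived helper (hypothetical, for exposition): visualization
// code often wants class break values rather than the clusters
// themselves, and the upper bound of each cluster serves as a break.
function demoCkmeansBreaks(x, nClusters) {
    return ckmeans(x, nClusters).map(function (cluster) {
        return cluster[cluster.length - 1];
    });
}
// demoCkmeansBreaks([-1, 2, -1, 2, 4, 5, 6, -1, 2, -1], 3); // => [-1, 2, 6]
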
/**
 * Given an array of x, this will find the extent of the
 * x and return an array of breaks that can be used
 * to categorize the x into a number of classes. The
 * returned array will always be 1 longer than the number of
 * classes because it includes the minimum value.
 *
 * @param {Array<number>} x an array of number values
 * @param {number} nClasses number of desired classes
 * @returns {Array<number>} array of class break positions
 * @example
 * equalIntervalBreaks([1, 2, 3, 4, 5, 6], 4); // => [1, 2.25, 3.5, 4.75, 6]
 */
function equalIntervalBreaks(x, nClasses) {
    if (x.length < 2) {
        return x;
    }

    var theMin = min(x);
    var theMax = max(x);

    // the first break will always be the minimum value
    // in the set
    var breaks = [theMin];

    // The size of each break is the full range of the x
    // divided by the number of classes requested
    var breakSize = (theMax - theMin) / nClasses;

    // In the case of nClasses = 1, this loop won't run
    // and the returned breaks will be [min, max]
    for (var i = 1; i < nClasses; i++) {
        breaks.push(breaks[0] + breakSize * i);
    }

    // the last break will always be the
    // maximum.
    breaks.push(theMax);

    return breaks;
}

/**
 * [Sample covariance](https://en.wikipedia.org/wiki/Sample_mean_and_covariance) of two datasets:
 * how much do the two datasets move together?
 * x and y are two datasets, represented as arrays of numbers.
 *
 * @param {Array<number>} x a sample of two or more data points
 * @param {Array<number>} y a sample of two or more data points
 * @throws {Error} if x and y do not have equal lengths
 * @throws {Error} if x or y have length of one or less
 * @returns {number} sample covariance
 * @example
 * sampleCovariance([1, 2, 3, 4, 5, 6], [6, 5, 4, 3, 2, 1]); // => -3.5
 */
function sampleCovariance(x, y) {
    // The two datasets must have the same length which must be more than 1
    if (x.length !== y.length) {
        throw new Error("sampleCovariance requires samples with equal lengths");
    }

    if (x.length < 2) {
        throw new Error(
            "sampleCovariance requires at least two data points in each sample"
        );
    }

    // determine the mean of each dataset so that we can judge each
    // value of the dataset fairly as the difference from the mean. this
    // way, if one dataset is [1, 2, 3] and [2, 3, 4], their covariance
    // does not suffer because of the difference in absolute values
    var xmean = mean(x);
    var ymean = mean(y);
    var sum = 0;

    // for each pair of values, the covariance increases when their
    // difference from the mean is associated - if both are well above
    // or if both are well below
    // the mean, the covariance increases significantly.
    for (var i = 0; i < x.length; i++) {
        sum += (x[i] - xmean) * (y[i] - ymean);
    }

    // this is Bessel's correction: an adjustment made to sample statistics
    // that allows for the reduced degree of freedom entailed in calculating
    // values from samples rather than complete populations.
    var besselsCorrection = x.length - 1;

    // the covariance is scaled by the length of the datasets,
    // less one degree of freedom.
    return sum / besselsCorrection;
}

/**
 * The [sample variance](https://en.wikipedia.org/wiki/Variance#Sample_variance)
 * is the sum of squared deviations from the mean. The sample variance
 * is distinguished from the variance by the usage of [Bessel's Correction](https://en.wikipedia.org/wiki/Bessel's_correction):
 * instead of dividing the sum of squared deviations by the length of the input,
 * it is divided by the length minus one. This corrects the bias in estimating
 * a value from a set that you don't know in full.
 *
 * References:
 * * [Wolfram MathWorld on Sample Variance](http://mathworld.wolfram.com/SampleVariance.html)
 *
 * @param {Array<number>} x a sample of two or more data points
 * @throws {Error} if the length of x is less than 2
 * @return {number} sample variance
 * @example
 * sampleVariance([1, 2, 3, 4, 5]); // => 2.5
 */
function sampleVariance(x) {
    if (x.length < 2) {
        throw new Error("sampleVariance requires at least two data points");
    }

    var sumSquaredDeviationsValue = sumNthPowerDeviations(x, 2);

    // this is Bessel's correction: an adjustment made to sample statistics
    // that allows for the reduced degree of freedom entailed in calculating
    // values from samples rather than complete populations.
    var besselsCorrection = x.length - 1;

    // Divide the sum of squared deviations by the corrected sample size
    return sumSquaredDeviationsValue / besselsCorrection;
}

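// Identity check (exposition only): the covariance of a sample with
// itself is its sample variance, which makes Bessel's correction in
// both functions above visible from either direction.
function demoCovarianceVarianceIdentity() {
    var x = [1, 2, 3, 4, 5];
    return sampleCovariance(x, x) === sampleVariance(x); // => true, both 2.5
}
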
/**
 * The [sample standard deviation](http://en.wikipedia.org/wiki/Standard_deviation#Sample_standard_deviation)
 * is the square root of the sample variance.
 *
 * @param {Array<number>} x input array
 * @returns {number} sample standard deviation
 * @example
 * sampleStandardDeviation([2, 4, 4, 4, 5, 5, 7, 9]).toFixed(2);
 * // => '2.14'
 */
function sampleStandardDeviation(x) {
    var sampleVarianceX = sampleVariance(x);
    return Math.sqrt(sampleVarianceX);
}

/**
 * The [correlation](http://en.wikipedia.org/wiki/Correlation_and_dependence) is
 * a measure of how correlated two datasets are, between -1 and 1
 *
 * @param {Array<number>} x first input
 * @param {Array<number>} y second input
 * @returns {number} sample correlation
 * @example
 * sampleCorrelation([1, 2, 3, 4, 5, 6], [2, 2, 3, 4, 5, 60]).toFixed(2);
 * // => '0.69'
 */
function sampleCorrelation(x, y) {
    var cov = sampleCovariance(x, y);
    var xstd = sampleStandardDeviation(x);
    var ystd = sampleStandardDeviation(y);

    return cov / xstd / ystd;
}

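// Boundary check (exposition only): normalizing covariance by both
// standard deviations pins perfectly linear relationships to the
// endpoints of the [-1, 1] range.
function demoCorrelationBounds() {
    return [
        sampleCorrelation([1, 2, 3], [1, 2, 3]), // => 1
        sampleCorrelation([1, 2, 3], [3, 2, 1]) // => -1
    ];
}
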
/**
 * [Skewness](http://en.wikipedia.org/wiki/Skewness) is
 * a measure of the extent to which a probability distribution of a
 * real-valued random variable "leans" to one side of the mean.
 * The skewness value can be positive or negative, or even undefined.
 *
 * Implementation is based on the adjusted Fisher-Pearson standardized
 * moment coefficient, which is the version found in Excel and several
 * statistical packages including Minitab, SAS and SPSS.
 *
 * @since 4.1.0
 * @param {Array<number>} x a sample of 3 or more data points
 * @returns {number} sample skewness
 * @throws {Error} if x has length less than 3
 * @example
 * sampleSkewness([2, 4, 6, 3, 1]); // => 0.590128656384365
 */
function sampleSkewness(x) {
    if (x.length < 3) {
        throw new Error("sampleSkewness requires at least three data points");
    }

    var meanValue = mean(x);
    var tempValue;
    var sumSquaredDeviations = 0;
    var sumCubedDeviations = 0;

    for (var i = 0; i < x.length; i++) {
        tempValue = x[i] - meanValue;
        sumSquaredDeviations += tempValue * tempValue;
        sumCubedDeviations += tempValue * tempValue * tempValue;
    }

    // this is Bessel's correction: an adjustment made to sample statistics
    // that allows for the reduced degree of freedom entailed in calculating
    // values from samples rather than complete populations.
    var besselsCorrection = x.length - 1;

    // Compute the sample standard deviation from the sum of
    // squared deviations
    var theSampleStandardDeviation = Math.sqrt(
        sumSquaredDeviations / besselsCorrection
    );

    var n = x.length;
    var cubedS = Math.pow(theSampleStandardDeviation, 3);

    return (n * sumCubedDeviations) / ((n - 1) * (n - 2) * cubedS);
}

1734/**
1735 * [Kurtosis](http://en.wikipedia.org/wiki/Kurtosis) is
1736 * a measure of the heaviness of a distribution's tails relative to its
1737 * variance. The kurtosis value can be positive or negative, or even undefined.
1738 *
1739 * Implementation is based on Fisher's excess kurtosis definition and uses
1740 * unbiased moment estimators. This is the version found in Excel and available
1741 * in several statistical packages, including SAS and SciPy.
1742 *
1743 * @param {Array<number>} x a sample of 4 or more data points
1744 * @returns {number} sample kurtosis
1745 * @throws {Error} if x has length less than 4
1746 * @example
1747 * sampleKurtosis([1, 2, 2, 3, 5]); // => 1.4555765595463122
1748 */
1749function sampleKurtosis(x) {
1750 var n = x.length;
1751
1752 if (n < 4) {
1753 throw new Error("sampleKurtosis requires at least four data points");
1754 }
1755
1756 var meanValue = mean(x);
1757 var tempValue;
1758 var secondCentralMoment = 0;
1759 var fourthCentralMoment = 0;
1760
1761 for (var i = 0; i < n; i++) {
1762 tempValue = x[i] - meanValue;
1763 secondCentralMoment += tempValue * tempValue;
1764 fourthCentralMoment += tempValue * tempValue * tempValue * tempValue;
1765 }
1766
1767 return (
1768 ((n - 1) / ((n - 2) * (n - 3))) *
1769 ((n * (n + 1) * fourthCentralMoment) /
1770 (secondCentralMoment * secondCentralMoment) -
1771 3 * (n - 1))
1772 );
1773}
1774
1775/**
1776 * Implementation of [Heap's Algorithm](https://en.wikipedia.org/wiki/Heap%27s_algorithm)
1777 * for generating permutations.
1778 *
1779 * @param {Array} elements any type of data
1780 * @returns {Array<Array>} array of permutations
1781 */
1782function permutationsHeap(elements) {
1783 var indexes = new Array(elements.length);
1784 var permutations = [elements.slice()];
1785
1786 for (var i = 0; i < elements.length; i++) {
1787 indexes[i] = 0;
1788 }
1789
1790 for (var i$1 = 0; i$1 < elements.length; ) {
1791 if (indexes[i$1] < i$1) {
1792 // At odd indexes, swap from indexes[i] instead
1793 // of from the beginning of the array
1794 var swapFrom = 0;
1795 if (i$1 % 2 !== 0) {
1796 swapFrom = indexes[i$1];
1797 }
1798
1799 // swap between swapFrom and i, using
1800 // a temporary variable as storage.
1801 var temp = elements[swapFrom];
1802 elements[swapFrom] = elements[i$1];
1803 elements[i$1] = temp;
1804
1805 permutations.push(elements.slice());
1806 indexes[i$1]++;
1807 i$1 = 0;
1808 } else {
1809 indexes[i$1] = 0;
1810 i$1++;
1811 }
1812 }
1813
1814 return permutations;
1815}
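
// A usage sketch (illustrative; not part of the library source): Heap's
// algorithm produces every ordering of the input, so a two-element array
// yields 2! = 2 permutations. Note that the input array itself is permuted
// in place while the algorithm runs, so pass a copy if the original order
// matters.
//
//     permutationsHeap(["a", "b"]);
//     // => [["a", "b"], ["b", "a"]]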
1816
1817/**
1818 * Implementation of [Combinations](https://en.wikipedia.org/wiki/Combination)
1819 * Combinations are unique subsets of a collection - in this case, k elements from the collection `x` at a time.
1821 * @param {Array} x any type of data
1822 * @param {int} k the number of objects in each group (without replacement)
1823 * @returns {Array<Array>} array of combinations
1824 * @example
1825 * combinations([1, 2, 3], 2); // => [[1,2], [1,3], [2,3]]
1826 */
1828function combinations(x, k) {
1829 var i;
1830 var subI;
1831 var combinationList = [];
1832 var subsetCombinations;
1833 var next;
1834
1835 for (i = 0; i < x.length; i++) {
1836 if (k === 1) {
1837 combinationList.push([x[i]]);
1838 } else {
1839 subsetCombinations = combinations(x.slice(i + 1, x.length), k - 1);
1840 for (subI = 0; subI < subsetCombinations.length; subI++) {
1841 next = subsetCombinations[subI];
1842 next.unshift(x[i]);
1843 combinationList.push(next);
1844 }
1845 }
1846 }
1847 return combinationList;
1848}
1849
1850/**
1851 * Implementation of [Combinations](https://en.wikipedia.org/wiki/Combination) with replacement
1852 * Combinations are unique subsets of a collection - in this case, k elements from the collection `x` at a time.
1853 * 'With replacement' means that a given element can be chosen multiple times.
1854 * Unlike permutation, order doesn't matter for combinations.
1855 *
1856 * @param {Array} x any type of data
1857 * @param {int} k the number of objects in each group (with replacement)
1858 * @returns {Array<Array>} array of combinations
1859 * @example
1860 * combinationsReplacement([1, 2], 2); // => [[1, 1], [1, 2], [2, 2]]
1861 */
1862function combinationsReplacement(x, k) {
1863 var combinationList = [];
1864
1865 for (var i = 0; i < x.length; i++) {
1866 if (k === 1) {
1867 // If we're requested to find only one element, we don't need
1868 // to recurse: just push `x[i]` onto the list of combinations.
1869 combinationList.push([x[i]]);
1870 } else {
1871 // Otherwise, recursively find combinations, given `k - 1`. Note that
1872 // we request `k - 1`, so if you were looking for k=3 combinations, we're
1873 // requesting k=2. This -1 gets reversed in the for loop right after this
1874 // code, since we concatenate `x[i]` onto the selected combinations,
1875 // bringing `k` back up to your requested level.
1876 // This recursion may go many levels deep, since it only stops once
1877 // k=1.
1878 var subsetCombinations = combinationsReplacement(
1879 x.slice(i, x.length),
1880 k - 1
1881 );
1882
1883 for (var j = 0; j < subsetCombinations.length; j++) {
1884 combinationList.push([x[i]].concat(subsetCombinations[j]));
1885 }
1886 }
1887 }
1888
1889 return combinationList;
1890}
1891
1892/**
1893 * When adding a new value to a list, one does not necessarily have to
1894 * recompute the mean of the list in linear time. One can instead use
1895 * this function to compute the new mean by providing the current mean,
1896 * the number of elements in the list that produced it and the new
1897 * value to add.
1898 *
1899 * @since 2.5.0
1900 * @param {number} mean current mean
1901 * @param {number} n number of items in the list
1902 * @param {number} newValue the added value
1903 * @returns {number} the new mean
1904 *
1905 * @example
1906 * addToMean(14, 5, 53); // => 20.5
1907 */
1908function addToMean(mean, n, newValue) {
1909 return mean + (newValue - mean) / (n + 1);
1910}
1911
1912/**
1913 * When combining two lists of values for which one already knows the means,
1914 * one does not necessarily have to recompute the mean of the combined lists
1915 * in linear time. One can instead use this function to compute the combined
1916 * mean by providing the mean & number of values of the first list and the mean
1917 * & number of values of the second list.
1918 *
1919 * @since 3.0.0
1920 * @param {number} mean1 mean of the first list
1921 * @param {number} n1 number of items in the first list
1922 * @param {number} mean2 mean of the second list
1923 * @param {number} n2 number of items in the second list
1924 * @returns {number} the combined mean
1925 *
1926 * @example
1927 * combineMeans(5, 3, 4, 3); // => 4.5
1928 */
1929function combineMeans(mean1, n1, mean2, n2) {
1930 return (mean1 * n1 + mean2 * n2) / (n1 + n2);
1931}
1932
1933/**
1934 * When combining two lists of values for which one already knows the variances,
1935 * one does not necessarily have to recompute the variance of the combined lists
1936 * in linear time. One can instead use this function to compute the combined
1937 * variance by providing the variance, mean & number of values of the first list
1938 * and the variance, mean & number of values of the second list.
1939 *
1940 * @since 3.0.0
1941 * @param {number} variance1 variance of the first list
1942 * @param {number} mean1 mean of the first list
1943 * @param {number} n1 number of items in the first list
1944 * @param {number} variance2 variance of the second list
1945 * @param {number} mean2 mean of the second list
1946 * @param {number} n2 number of items in the second list
1947 * @returns {number} the combined variance
1948 *
1949 * @example
1950 * combineVariances(14 / 3, 5, 3, 8 / 3, 4, 3); // => 47 / 12
1951 */
1952function combineVariances(variance1, mean1, n1, variance2, mean2, n2) {
1953 var newMean = combineMeans(mean1, n1, mean2, n2);
1954
1955 return (
1956 (n1 * (variance1 + Math.pow(mean1 - newMean, 2)) +
1957 n2 * (variance2 + Math.pow(mean2 - newMean, 2))) /
1958 (n1 + n2)
1959 );
1960}
1961
1962/**
1963 * The [Geometric Mean](https://en.wikipedia.org/wiki/Geometric_mean) is
1964 * a mean function that is more useful for numbers in different
1965 * ranges.
1966 *
1967 * This is the nth root of the product of the input numbers.
1968 *
1969 * The geometric mean is often useful for
1970 * **[proportional growth](https://en.wikipedia.org/wiki/Geometric_mean#Proportional_growth)**: given
1971 * growth rates for multiple years, like _80%, 16.66% and 42.85%_, a simple
1972 * mean will incorrectly estimate an average growth rate, whereas a geometric
1973 * mean will correctly estimate a growth rate that, over those years,
1974 * will yield the same end value.
1975 *
1976 * This runs in `O(n)`, linear time, with respect to the length of the array.
1977 *
1978 * @param {Array<number>} x sample of one or more data points
1979 * @returns {number} geometric mean
1980 * @throws {Error} if x is empty
1981 * @throws {Error} if x contains a number less than or equal to zero
1982 * @example
1983 * var growthRates = [1.80, 1.166666, 1.428571];
1984 * var averageGrowth = ss.geometricMean(growthRates);
1985 * var averageGrowthRates = [averageGrowth, averageGrowth, averageGrowth];
1986 * var startingValue = 10;
1987 * var startingValueMean = 10;
1988 * growthRates.forEach(function(rate) {
1989 * startingValue *= rate;
1990 * });
1991 * averageGrowthRates.forEach(function(rate) {
1992 * startingValueMean *= rate;
1993 * });
1994 * startingValueMean === startingValue;
1995 */
1996function geometricMean(x) {
1997 if (x.length === 0) {
1998 throw new Error("geometricMean requires at least one data point");
1999 }
2000
2001 // the starting value.
2002 var value = 1;
2003
2004 for (var i = 0; i < x.length; i++) {
2005 // the geometric mean is only valid for positive numbers
2006 if (x[i] <= 0) {
2007 throw new Error(
2008 "geometricMean requires only positive numbers as input"
2009 );
2010 }
2011
2012 // repeatedly multiply the value by each number
2013 value *= x[i];
2014 }
2015
2016 return Math.pow(value, 1 / x.length);
2017}
2018
2019/**
2020 * The [Harmonic Mean](https://en.wikipedia.org/wiki/Harmonic_mean) is
2021 * a mean function typically used to find the average of rates.
2022 * This mean is calculated by taking the reciprocal of the arithmetic mean
2023 * of the reciprocals of the input numbers.
2024 *
2025 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
2026 * a method of finding a typical or central value of a set of numbers.
2027 *
2028 * This runs in `O(n)`, linear time, with respect to the length of the array.
2029 *
2030 * @param {Array<number>} x sample of one or more data points
2031 * @returns {number} harmonic mean
2032 * @throws {Error} if x is empty
2033 * @throws {Error} if x contains a number less than or equal to zero
2034 * @example
2035 * harmonicMean([2, 3]).toFixed(2) // => '2.40'
2036 */
2037function harmonicMean(x) {
2038 if (x.length === 0) {
2039 throw new Error("harmonicMean requires at least one data point");
2040 }
2041
2042 var reciprocalSum = 0;
2043
2044 for (var i = 0; i < x.length; i++) {
2045 // the harmonic mean is only valid for positive numbers
2046 if (x[i] <= 0) {
2047 throw new Error(
2048 "harmonicMean requires only positive numbers as input"
2049 );
2050 }
2051
2052 reciprocalSum += 1 / x[i];
2053 }
2054
2055 // divide n by the reciprocal sum
2056 return x.length / reciprocalSum;
2057}
2058
2059/**
2060 * The mean, _also known as average_,
2061 * is the sum of all values over the number of values.
2062 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
2063 * a method of finding a typical or central value of a set of numbers.
2064 *
2065 * The simple mean uses the successive addition method internally
2066 * to calculate its result. Errors in floating-point addition are
2067 * not accounted for, so if precision is required, the standard {@link mean}
2068 * method should be used instead.
2069 *
2070 * This runs in `O(n)`, linear time, with respect to the length of the array.
2071 *
2072 *
2073 * @param {Array<number>} x sample of one or more data points
2074 * @throws {Error} if the length of x is less than one
2075 * @returns {number} mean
2076 * @example
2077 * mean([0, 10]); // => 5
2078 */
2079function meanSimple(x) {
2080 if (x.length === 0) {
2081 throw new Error("meanSimple requires at least one data point");
2082 }
2083
2084 return sumSimple(x) / x.length;
2085}
2086
2087/**
2088 * The [median](http://en.wikipedia.org/wiki/Median) is
2089 * the middle number of a list. This is often a good indicator of 'the middle'
2090 * when there are outliers that skew the `mean()` value.
2091 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
2092 * a method of finding a typical or central value of a set of numbers.
2093 *
2094 * The median isn't necessarily one of the elements in the list: the value
2095 * can be the average of two elements if the list has an even length
2096 * and the two central values are different.
2097 *
2098 * @param {Array<number>} sorted input array, sorted in ascending order
2099 * @returns {number} median value
2100 * @example
2101 * medianSorted([1, 2, 2, 5, 10, 100]); // => 3.5
2102 */
2103function medianSorted(sorted) {
2104 return quantileSorted(sorted, 0.5);
2105}
2106
2107/**
2108 * When removing a value from a list, one does not necessarily have to
2109 * recompute the mean of the list in linear time. One can instead use
2110 * this function to compute the new mean by providing the current mean,
2111 * the number of elements in the list that produced it and the value to remove.
2112 *
2113 * @since 3.0.0
2114 * @param {number} mean current mean
2115 * @param {number} n number of items in the list
2116 * @param {number} value the value to remove
2117 * @returns {number} the new mean
2118 *
2119 * @example
2120 * subtractFromMean(20.5, 6, 53); // => 14
2121 */
2122function subtractFromMean(mean, n, value) {
2123 return (mean * n - value) / (n - 1);
2124}
2125
2126/**
2127 * The Root Mean Square (RMS) is
2128 * a mean function used as a measure of the magnitude of a set
2129 * of numbers, regardless of their sign.
2130 * This is the square root of the mean of the squares of the
2131 * input numbers.
2132 * This runs in `O(n)`, linear time, with respect to the length of the array.
2133 *
2134 * @param {Array<number>} x a sample of one or more data points
2135 * @returns {number} root mean square
2136 * @throws {Error} if x is empty
2137 * @example
2138 * rootMeanSquare([-1, 1, -1, 1]); // => 1
2139 */
2140function rootMeanSquare(x) {
2141 if (x.length === 0) {
2142 throw new Error("rootMeanSquare requires at least one data point");
2143 }
2144
2145 var sumOfSquares = 0;
2146 for (var i = 0; i < x.length; i++) {
2147 sumOfSquares += Math.pow(x[i], 2);
2148 }
2149
2150 return Math.sqrt(sumOfSquares / x.length);
2151}
2152
2153/**
2154 * This is to compute [a one-sample t-test](https://en.wikipedia.org/wiki/Student%27s_t-test#One-sample_t-test), comparing the mean
2155 * of the sample `x` to a known value, `expectedValue`.
2156 *
2157 * In this case, we're trying to determine whether the
2158 * population mean is equal to the value that we know, which is
2159 * `expectedValue`. Usually the results here are used to look up a
2160 * [p-value](http://en.wikipedia.org/wiki/P-value), which, for
2161 * a certain level of significance, will let you determine that the
2162 * null hypothesis can or cannot be rejected.
2163 *
2164 * @param {Array<number>} x sample of one or more numbers
2165 * @param {number} expectedValue expected value of the population mean
2166 * @returns {number} value
2167 * @example
2168 * tTest([1, 2, 3, 4, 5, 6], 3.385).toFixed(2); // => '0.16'
2169 */
2170function tTest(x, expectedValue) {
2171 // The mean of the sample
2172 var sampleMean = mean(x);
2173
2174 // The standard deviation of the sample
2175 var sd = standardDeviation(x);
2176
2177 // Square root the length of the sample
2178 var rootN = Math.sqrt(x.length);
2179
2180 // returning the t value
2181 return (sampleMean - expectedValue) / (sd / rootN);
2182}
2183
2184/**
2185 * This is to compute a [two-sample t-test](http://en.wikipedia.org/wiki/Student's_t-test).
2186 * It tests whether "mean(X) - mean(Y) = difference" (in the most common
2187 * case, `difference == 0`, which tests whether two samples are likely to
2188 * have been drawn from populations with the same mean) with no prior
2189 * knowledge of the standard deviations of either sample, other than the
2190 * assumption that the two standard deviations are equal.
2191 *
2192 * Usually the results here are used to look up a
2193 * [p-value](http://en.wikipedia.org/wiki/P-value), which, for
2194 * a certain level of significance, will let you determine that the
2195 * null hypothesis can or cannot be rejected.
2196 *
2197 * `difference` can be omitted if it equals 0.
2198 *
2199 * [This is used to confirm or deny](http://www.monarchlab.org/Lab/Research/Stats/2SampleT.aspx)
2200 * a null hypothesis that the two populations that have been sampled into
2201 * `sampleX` and `sampleY` have the same mean.
2202 *
2203 * @param {Array<number>} sampleX a sample as an array of numbers
2204 * @param {Array<number>} sampleY a sample as an array of numbers
2205 * @param {number} [difference=0]
2206 * @returns {number|null} test result
2207 *
2208 * @example
2209 * tTestTwoSample([1, 2, 3, 4], [3, 4, 5, 6], 0); // => -2.1908902300206643
2210 */
2211function tTestTwoSample(sampleX, sampleY, difference) {
2212 var n = sampleX.length;
2213 var m = sampleY.length;
2214
2215 // If either sample doesn't actually have any values, we can't
2216 // compute this at all, so we return `null`.
2217 if (!n || !m) {
2218 return null;
2219 }
2220
2221 // default difference (mu) is zero
2222 if (!difference) {
2223 difference = 0;
2224 }
2225
2226 var meanX = mean(sampleX);
2227 var meanY = mean(sampleY);
2228 var sampleVarianceX = sampleVariance(sampleX);
2229 var sampleVarianceY = sampleVariance(sampleY);
2230
2231 if (
2232 typeof meanX === "number" &&
2233 typeof meanY === "number" &&
2234 typeof sampleVarianceX === "number" &&
2235 typeof sampleVarianceY === "number"
2236 ) {
2237 var weightedVariance =
2238 ((n - 1) * sampleVarianceX + (m - 1) * sampleVarianceY) /
2239 (n + m - 2);
2240
2241 return (
2242 (meanX - meanY - difference) /
2243 Math.sqrt(weightedVariance * (1 / n + 1 / m))
2244 );
2245 }
2246}
2247
2248/**
2249 * [Bayesian Classifier](http://en.wikipedia.org/wiki/Naive_Bayes_classifier)
2250 *
2251 * This is a naïve Bayesian classifier that takes
2252 * singly-nested objects.
2253 *
2254 * @class
2255 * @example
2256 * var bayes = new BayesianClassifier();
2257 * bayes.train({
2258 * species: 'Cat'
2259 * }, 'animal');
2260 * var result = bayes.score({
2261 * species: 'Cat'
2262 * })
2263 * // result
2264 * // {
2265 * // animal: 1
2266 * // }
2267 */
2268var BayesianClassifier = function BayesianClassifier() {
2269 // The number of items that are currently
2270 // classified in the model
2271 this.totalCount = 0;
2272 // Every item classified in the model
2273 this.data = {};
2274};
2275
2276/**
2277 * Train the classifier with a new item, which has a single
2278 * dimension of JavaScript literal keys and values.
2279 *
2280 * @param {Object} item an object with singly-deep properties
2281 * @param {string} category the category this item belongs to
2282 * @return {undefined} adds the item to the classifier
2283 */
2284BayesianClassifier.prototype.train = function train (item, category) {
2285 // If the data object doesn't have any values
2286 // for this category, create a new object for it.
2287 if (!this.data[category]) {
2288 this.data[category] = {};
2289 }
2290
2291 // Iterate through each key in the item.
2292 for (var k in item) {
2293 var v = item[k];
2294 // Initialize the nested object `data[category][k][item[k]]`
2295 // with an object of keys that equal 0.
2296 if (this.data[category][k] === undefined) {
2297 this.data[category][k] = {};
2298 }
2299 if (this.data[category][k][v] === undefined) {
2300 this.data[category][k][v] = 0;
2301 }
2302
2303 // And increment the key for this key/value combination.
2304 this.data[category][k][v]++;
2305 }
2306
2307 // Increment the number of items classified
2308 this.totalCount++;
2309};
2310
2311/**
2312 * Generate a score of how well this item matches all
2313 * possible categories based on its attributes
2314 *
2315 * @param {Object} item an item in the same format as with train
2316 * @returns {Object} of probabilities that this item belongs to a
2317 * given category.
2318 */
2319BayesianClassifier.prototype.score = function score (item) {
2320 // Initialize an empty object of odds per category.
2321 var odds = {};
2322 var category;
2323 // Iterate through each key in the item,
2324 // then iterate through each category that has been used
2325 // in previous calls to `.train()`
2326 for (var k in item) {
2327 var v = item[k];
2328 for (category in this.data) {
2329 // Create an empty object for storing key - value combinations
2330 // for this category.
2331 odds[category] = {};
2332
2333 // If this item doesn't even have a property, it counts for nothing,
2334 // but if it does have the property that we're looking for from
2335 // the item to categorize, it counts based on how popular it is
2336 // versus the whole population.
2337 if (this.data[category][k]) {
2338 odds[category][k + "_" + v] =
2339 (this.data[category][k][v] || 0) / this.totalCount;
2340 } else {
2341 odds[category][k + "_" + v] = 0;
2342 }
2343 }
2344 }
2345
2346 // Set up a new object that will contain sums of these odds by category
2347 var oddsSums = {};
2348
2349 for (category in odds) {
2350 // Tally all of the odds for each category-combination pair -
2351 // the non-existence of a category does not add anything to the
2352 // score.
2353 oddsSums[category] = 0;
2354 for (var combination in odds[category]) {
2355 oddsSums[category] += odds[category][combination];
2356 }
2357 }
2358
2359 return oddsSums;
2360};
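
// A usage sketch (illustrative; not part of the library source): after
// training on three items across two categories, scoring an item reports
// how often each of its key/value pairs was seen in each category,
// relative to everything the classifier has seen so far.
//
//     var bayes = new BayesianClassifier();
//     bayes.train({ species: "Cat" }, "animal");
//     bayes.train({ species: "Dog" }, "animal");
//     bayes.train({ species: "Rock" }, "mineral");
//     bayes.score({ species: "Cat" });
//     // => { animal: 0.3333..., mineral: 0 }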
2361
2362/**
2363 * This is a single-layer [Perceptron Classifier](http://en.wikipedia.org/wiki/Perceptron) that takes
2364 * arrays of numbers and predicts whether they should be classified
2365 * as either 0 or 1 (negative or positive examples).
2366 * @class
2367 * @example
2368 * // Create the model
2369 * var p = new PerceptronModel();
2370 * // Train the model with input with a diagonal boundary.
2371 * for (var i = 0; i < 5; i++) {
2372 * p.train([1, 1], 1);
2373 * p.train([0, 1], 0);
2374 * p.train([1, 0], 0);
2375 * p.train([0, 0], 0);
2376 * }
2377 * p.predict([0, 0]); // 0
2378 * p.predict([0, 1]); // 0
2379 * p.predict([1, 0]); // 0
2380 * p.predict([1, 1]); // 1
2381 */
2382var PerceptronModel = function PerceptronModel() {
2383 // The weights, or coefficients of the model;
2384 // weights are only populated when training with data.
2385 this.weights = [];
2386 // The bias term, or intercept; it is also a weight but
2387 // it's stored separately for convenience as it is always
2388 // multiplied by one.
2389 this.bias = 0;
2390};
2391/**
2392 * **Predict**: Use an array of features with the weight array and bias
2393 * to predict whether an example is labeled 0 or 1.
2394 *
2395 * @param {Array<number>} features an array of features as numbers
2396 * @returns {number|null} 1 if the score is over 0, otherwise 0; null if the model has not yet been trained on features of this length
2397 */
2398PerceptronModel.prototype.predict = function predict (features) {
2399 // Only predict if previously trained
2400 // on the same size feature array(s).
2401 if (features.length !== this.weights.length) {
2402 return null;
2403 }
2404
2405 // Calculate the sum of features times weights,
2406 // with the bias added (implicitly times one).
2407 var score = 0;
2408 for (var i = 0; i < this.weights.length; i++) {
2409 score += this.weights[i] * features[i];
2410 }
2411 score += this.bias;
2412
2413 // Classify as 1 if the score is over 0, otherwise 0.
2414 if (score > 0) {
2415 return 1;
2416 } else {
2417 return 0;
2418 }
2419};
2420
2421/**
2422 * **Train** the classifier with a new example, which is
2423 * a numeric array of features and a 0 or 1 label.
2424 *
2425 * @param {Array<number>} features an array of features as numbers
2426 * @param {number} label either 0 or 1
2427 * @returns {PerceptronModel|null} this, or null if the label is not 0 or 1
2428 */
2429PerceptronModel.prototype.train = function train (features, label) {
2430 // Require that only labels of 0 or 1 are considered.
2431 if (label !== 0 && label !== 1) {
2432 return null;
2433 }
2434 // The length of the feature array determines
2435 // the length of the weight array.
2436 // The perceptron will continue learning as long as
2437 // it keeps seeing feature arrays of the same length.
2438 // When it sees a new data shape, it initializes.
2439 if (features.length !== this.weights.length) {
2440 this.weights = features;
2441 this.bias = 1;
2442 }
2443 // Make a prediction based on current weights.
2444 var prediction = this.predict(features);
2445 // Update the weights if the prediction is wrong.
2446 if (typeof prediction === "number" && prediction !== label) {
2447 var gradient = label - prediction;
2448 for (var i = 0; i < this.weights.length; i++) {
2449 this.weights[i] += gradient * features[i];
2450 }
2451 this.bias += gradient;
2452 }
2453 return this;
2454};
2455
2456/**
2457 * We use `ε`, epsilon, as a stopping criterion when we want to iterate
2458 * until we're "close enough". Epsilon is a very small number: for
2459 * simple statistics, that number is **0.0001**
2460 *
2461 * This is used in calculations like the binomialDistribution, in which
2462 * the process of finding a value is [iterative](https://en.wikipedia.org/wiki/Iterative_method):
2463 * it progresses until it is close enough.
2464 *
2465 * Below is an example of using epsilon in [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent),
2466 * where we're trying to find a local minimum of a function's derivative,
2467 * given by the `fDerivative` method.
2468 *
2469 * @example
2470 * // From calculation, we expect that the local minimum occurs at x=9/4
2471 * var x_old = 0;
2472 * // The algorithm starts at x=6
2473 * var x_new = 6;
2474 * var stepSize = 0.01;
2475 *
2476 * function fDerivative(x) {
2477 * return 4 * Math.pow(x, 3) - 9 * Math.pow(x, 2);
2478 * }
2479 *
2480 * // The loop runs until the difference between the previous
2481 * // value and the current value is smaller than epsilon - a rough
2482 * // measure of 'close enough'
2483 * while (Math.abs(x_new - x_old) > ss.epsilon) {
2484 * x_old = x_new;
2485 * x_new = x_old - stepSize * fDerivative(x_old);
2486 * }
2487 *
2488 * console.log('Local minimum occurs at', x_new);
2489 */
2490var epsilon = 0.0001;
2491
2492/**
2493 * A [Factorial](https://en.wikipedia.org/wiki/Factorial), usually written n!, is the product of all positive
2494 * integers less than or equal to n. Often factorial is implemented
2495 * recursively, but this iterative approach is significantly faster
2496 * and simpler.
2497 *
2498 * @param {number} n input, must be a non-negative integer
2499 * @returns {number} factorial: n!
2500 * @throws {Error} if n is less than 0 or not an integer
2501 * @example
2502 * factorial(5); // => 120
2503 */
2504function factorial(n) {
2505 // factorial is mathematically undefined for negative numbers
2506 if (n < 0) {
2507 throw new Error("factorial requires a non-negative value");
2508 }
2509
2510 if (Math.floor(n) !== n) {
2511 throw new Error("factorial requires an integer input");
2512 }
2513
2514 // typically you'll expand the factorial function going down, like
2515 // 5! = 5 * 4 * 3 * 2 * 1. This is going in the opposite direction,
2516 // counting from 2 up to the number in question, and since anything
2517 // multiplied by 1 is itself, the loop only needs to start at 2.
2518 var accumulator = 1;
2519 for (var i = 2; i <= n; i++) {
2520 // for each number up to and including the number `n`, multiply
2521 // the accumulator by that number.
2522 accumulator *= i;
2523 }
2524 return accumulator;
2525}
2526
2527/**
2528 * Compute the [gamma function](https://en.wikipedia.org/wiki/Gamma_function) of a value using Nemes' approximation.
2529 * The gamma of n is equivalent to (n-1)!, but unlike the factorial function, gamma is defined for all real n except zero
2530 * and negative integers (where NaN is returned). Note, the gamma function is also well-defined for complex numbers,
2531 * though this implementation currently does not handle complex numbers as input values.
2532 * Nemes' approximation is defined [here](https://arxiv.org/abs/1003.6020) as Theorem 2.2.
2533 * Negative values use [Euler's reflection formula](https://en.wikipedia.org/wiki/Gamma_function#Properties) for computation.
2534 *
2535 * @param {number} n Any real number except for zero and negative integers.
2536 * @returns {number} The gamma of the input value.
2537 *
2538 * @example
2539 * gamma(11.5); // 11899423.084037038
2540 * gamma(-11.5); // 2.29575810481609e-8
2541 * gamma(5); // 24
2542 */
2543function gamma(n) {
2544 if (Number.isInteger(n)) {
2545 if (n <= 0) {
2546 // gamma not defined for zero or negative integers
2547 return NaN;
2548 } else {
2549 // use factorial for integer inputs
2550 return factorial(n - 1);
2551 }
2552 }
2553
2554 // Decrement n, because approximation is defined for n - 1
2555 n--;
2556
2557 if (n < 0) {
2558 // Use Euler's reflection formula for negative inputs
2559 // see: https://en.wikipedia.org/wiki/Gamma_function#Properties
2560 return Math.PI / (Math.sin(Math.PI * -n) * gamma(-n));
2561 } else {
2562 // Nemes' expansion approximation
2563 var seriesCoefficient =
2564 Math.pow(n / Math.E, n) * Math.sqrt(2 * Math.PI * (n + 1 / 6));
2565
2566 var seriesDenom = n + 1 / 4;
2567
2568 var seriesExpansion =
2569 1 +
2570 1 / 144 / Math.pow(seriesDenom, 2) -
2571 1 / 12960 / Math.pow(seriesDenom, 3) -
2572 257 / 207360 / Math.pow(seriesDenom, 4) -
2573 52 / 2612736 / Math.pow(seriesDenom, 5) +
2574 5741173 / 9405849600 / Math.pow(seriesDenom, 6) +
2575 37529 / 18811699200 / Math.pow(seriesDenom, 7);
2576
2577 return seriesCoefficient * seriesExpansion;
2578 }
2579}
2580
2581// Define series coefficients
2582var COEFFICIENTS = [
2583 0.99999999999999709182,
2584 57.156235665862923517,
2585 -59.597960355475491248,
2586 14.136097974741747174,
2587 -0.49191381609762019978,
2588 0.33994649984811888699e-4,
2589 0.46523628927048575665e-4,
2590 -0.98374475304879564677e-4,
2591 0.15808870322491248884e-3,
2592 -0.21026444172410488319e-3,
2593 0.2174396181152126432e-3,
2594 -0.16431810653676389022e-3,
2595 0.84418223983852743293e-4,
2596 -0.2619083840158140867e-4,
2597 0.36899182659531622704e-5
2598];
2599
2600var g = 607 / 128;
2601var LOGSQRT2PI = Math.log(Math.sqrt(2 * Math.PI));
2602
2603/**
2604 * Compute the logarithm of the [gamma function](https://en.wikipedia.org/wiki/Gamma_function) of a value using Lanczos' approximation.
2605 * This function takes as input any real value n greater than 0.
2606 * This function is useful for values of n too large for the normal gamma function (n > 165).
2607 * The code is based on Lanczos' gamma approximation, defined [here](http://my.fit.edu/~gabdo/gamma.txt).
2608 *
2609 * @param {number} n Any real number greater than zero.
2610 * @returns {number} The logarithm of gamma of the input value.
2611 *
2612 * @example
2613 * gammaln(500); // 2605.1158503617335
2614 * gammaln(2.4); // 0.21685932244884043
2615 */
2616function gammaln(n) {
2617 // Return infinity if value not in domain
2618 if (n <= 0) {
2619 return Infinity;
2620 }
2621
2622 // Decrement n, because approximation is defined for n - 1
2623 n--;
2624
2625 // Create series approximation
2626 var a = COEFFICIENTS[0];
2627
2628 for (var i = 1; i < 15; i++) {
2629 a += COEFFICIENTS[i] / (n + i);
2630 }
2631
2632 var tmp = g + 0.5 + n;
2633
2634 // Return natural logarithm of gamma(n)
2635 return LOGSQRT2PI + Math.log(a) - tmp + (n + 0.5) * Math.log(tmp);
2636}
2637
2638/**
2639 * The [Bernoulli distribution](http://en.wikipedia.org/wiki/Bernoulli_distribution)
2640 * is the discrete probability
2641 * distribution of a random variable which takes value 1 with success
2642 * probability `p` and value 0 with failure
2643 * probability `q` = 1 - `p`. It can be used, for example, to represent the
2644 * toss of a coin, where "1" is defined to mean "heads" and "0" is defined
2645 * to mean "tails" (or vice versa). It is
2646 * a special case of a Binomial Distribution
2647 * where `n` = 1.
2648 *
2649 * @param {number} p input value, between 0 and 1 inclusive
2650 * @returns {number[]} values of bernoulli distribution at this point
2651 * @throws {Error} if p is less than 0 or greater than 1
2652 * @example
2653 * bernoulliDistribution(0.3); // => [0.7, 0.3]
2654 */
2655function bernoulliDistribution(p) /*: number[] */ {
2656 // Check that `p` is a valid probability (0 ≤ p ≤ 1)
2657 if (p < 0 || p > 1) {
2658 throw new Error(
2659 "bernoulliDistribution requires probability to be between 0 and 1 inclusive"
2660 );
2661 }
2662
2663 return [1 - p, p];
2664}
2665
2666/**
2667 * The [Binomial Distribution](http://en.wikipedia.org/wiki/Binomial_distribution) is the discrete probability
2668 * distribution of the number of successes in a sequence of n independent yes/no experiments, each of which yields
2669 * success with probability `probability`. Such a success/failure experiment is also called a Bernoulli experiment or
2670 * Bernoulli trial; when trials = 1, the Binomial Distribution is a Bernoulli Distribution.
2671 *
2672 * @param {number} trials number of trials to simulate
2673 * @param {number} probability probability of success in each trial, between 0 and 1 inclusive
2674 * @returns {number[]} probability of seeing 0, 1, 2, ... successes, truncated once the cumulative probability is within `epsilon` of 1
2675 */
2676function binomialDistribution(trials, probability) /*: ?number[] */ {
2677 // Check that `probability` is a valid probability (0 ≤ p ≤ 1),
2678 // and that `trials` is a strictly positive integer.
2679 if (probability < 0 || probability > 1 || trials <= 0 || trials % 1 !== 0) {
2680 return undefined;
2681 }
2682
2683 // We initialize `x`, the random variable, and `cumulativeProbability`,
2684 // an accumulator for the cumulative distribution function, to 0.
2685 // `cells` is the array we'll return, holding the
2686 // probability mass at each value of `x`.
2687 // We iterate until the cumulative probability
2688 // is within `epsilon` of 1.0.
2689 var x = 0;
2690 var cumulativeProbability = 0;
2691 var cells = [];
2692 var binomialCoefficient = 1;
2693
2694 // This algorithm iterates through each potential outcome,
2695 // until the `cumulativeProbability` is very close to 1, at
2696 // which point we've defined the vast majority of outcomes
2697 do {
2698 // a [probability mass function](https://en.wikipedia.org/wiki/Probability_mass_function)
2699 cells[x] =
2700 binomialCoefficient *
2701 Math.pow(probability, x) *
2702 Math.pow(1 - probability, trials - x);
2703 cumulativeProbability += cells[x];
2704 x++;
2705 binomialCoefficient = (binomialCoefficient * (trials - x + 1)) / x;
2706 // when the cumulativeProbability is nearly 1, we've calculated
2707 // the useful range of this distribution
2708 } while (cumulativeProbability < 1 - epsilon);
2709
2710 return cells;
2711}
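
// A usage sketch (illustrative; not part of the library source): for four
// tosses of a fair coin, the returned array holds the probability of
// seeing exactly 0, 1, 2, 3 and 4 heads.
//
//     binomialDistribution(4, 0.5);
//     // => [0.0625, 0.25, 0.375, 0.25, 0.0625]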
2712
2713/**
2714 * The [Poisson Distribution](http://en.wikipedia.org/wiki/Poisson_distribution)
2715 * is a discrete probability distribution that expresses the probability
2716 * of a given number of events occurring in a fixed interval of time
2717 * and/or space if these events occur with a known average rate and
2718 * independently of the time since the last event.
2719 *
2720 * The Poisson Distribution is characterized by the strictly positive
2721 * mean arrival or occurrence rate, `λ`.
2722 *
2723 * @param {number} lambda the mean occurrence rate; must be strictly positive
2724 * @returns {number[]} probability of seeing 0, 1, 2, ... events, truncated once the cumulative probability is within `epsilon` of 1
2725 */
2726function poissonDistribution(lambda) /*: ?number[] */ {
2727 // Check that lambda is strictly positive
2728 if (lambda <= 0) {
2729 return undefined;
2730 }
2731
2732 // our current place in the distribution
2733 var x = 0;
2734 // and we keep track of the current cumulative probability, in
2735 // order to know when to stop calculating chances.
2736 var cumulativeProbability = 0;
2737 // the calculated cells to be returned
2738 var cells = [];
2739 var factorialX = 1;
2740
2741 // This algorithm iterates through each potential outcome,
2742 // until the `cumulativeProbability` is very close to 1, at
2743 // which point we've defined the vast majority of outcomes
2744 do {
2745 // a [probability mass function](https://en.wikipedia.org/wiki/Probability_mass_function)
2746 cells[x] = (Math.exp(-lambda) * Math.pow(lambda, x)) / factorialX;
2747 cumulativeProbability += cells[x];
2748 x++;
2749 factorialX *= x;
2750 // when the cumulativeProbability is nearly 1, we've calculated
2751 // the useful range of this distribution
2752 } while (cumulativeProbability < 1 - epsilon);
2753
2754 return cells;
2755}
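
// A usage sketch (illustrative; not part of the library source): with an
// average rate of lambda = 3, the chance of observing exactly four events
// is e^-3 * 3^4 / 4! ≈ 0.168.
//
//     poissonDistribution(3)[4].toFixed(2);
//     // => '0.17'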
2756
2757/**
2758 * **Percentage Points of the χ2 (Chi-Squared) Distribution**
2759 *
2760 * The [χ2 (Chi-Squared) Distribution](http://en.wikipedia.org/wiki/Chi-squared_distribution) is used in the common
2761 * chi-squared tests for goodness of fit of an observed distribution to a theoretical one, the independence of two
2762 * criteria of classification of qualitative data, and in confidence interval estimation for a population standard
2763 * deviation of a normal distribution from a sample standard deviation.
2764 *
2765 * Values from Appendix 1, Table III of William W. Hines & Douglas C. Montgomery, "Probability and Statistics in
2766 * Engineering and Management Science", Wiley (1980).
2767 */
2768var chiSquaredDistributionTable = {
2769 1: {
2770 0.995: 0,
2771 0.99: 0,
2772 0.975: 0,
2773 0.95: 0,
2774 0.9: 0.02,
2775 0.5: 0.45,
2776 0.1: 2.71,
2777 0.05: 3.84,
2778 0.025: 5.02,
2779 0.01: 6.63,
2780 0.005: 7.88
2781 },
2782 2: {
2783 0.995: 0.01,
2784 0.99: 0.02,
2785 0.975: 0.05,
2786 0.95: 0.1,
2787 0.9: 0.21,
2788 0.5: 1.39,
2789 0.1: 4.61,
2790 0.05: 5.99,
2791 0.025: 7.38,
2792 0.01: 9.21,
2793 0.005: 10.6
2794 },
2795 3: {
2796 0.995: 0.07,
2797 0.99: 0.11,
2798 0.975: 0.22,
2799 0.95: 0.35,
2800 0.9: 0.58,
2801 0.5: 2.37,
2802 0.1: 6.25,
2803 0.05: 7.81,
2804 0.025: 9.35,
2805 0.01: 11.34,
2806 0.005: 12.84
2807 },
2808 4: {
2809 0.995: 0.21,
2810 0.99: 0.3,
2811 0.975: 0.48,
2812 0.95: 0.71,
2813 0.9: 1.06,
2814 0.5: 3.36,
2815 0.1: 7.78,
2816 0.05: 9.49,
2817 0.025: 11.14,
2818 0.01: 13.28,
2819 0.005: 14.86
2820 },
2821 5: {
2822 0.995: 0.41,
2823 0.99: 0.55,
2824 0.975: 0.83,
2825 0.95: 1.15,
2826 0.9: 1.61,
2827 0.5: 4.35,
2828 0.1: 9.24,
2829 0.05: 11.07,
2830 0.025: 12.83,
2831 0.01: 15.09,
2832 0.005: 16.75
2833 },
2834 6: {
2835 0.995: 0.68,
2836 0.99: 0.87,
2837 0.975: 1.24,
2838 0.95: 1.64,
2839 0.9: 2.2,
2840 0.5: 5.35,
2841 0.1: 10.65,
2842 0.05: 12.59,
2843 0.025: 14.45,
2844 0.01: 16.81,
2845 0.005: 18.55
2846 },
2847 7: {
2848 0.995: 0.99,
2849 0.99: 1.25,
2850 0.975: 1.69,
2851 0.95: 2.17,
2852 0.9: 2.83,
2853 0.5: 6.35,
2854 0.1: 12.02,
2855 0.05: 14.07,
2856 0.025: 16.01,
2857 0.01: 18.48,
2858 0.005: 20.28
2859 },
2860 8: {
2861 0.995: 1.34,
2862 0.99: 1.65,
2863 0.975: 2.18,
2864 0.95: 2.73,
2865 0.9: 3.49,
2866 0.5: 7.34,
2867 0.1: 13.36,
2868 0.05: 15.51,
2869 0.025: 17.53,
2870 0.01: 20.09,
2871 0.005: 21.96
2872 },
2873 9: {
2874 0.995: 1.73,
2875 0.99: 2.09,
2876 0.975: 2.7,
2877 0.95: 3.33,
2878 0.9: 4.17,
2879 0.5: 8.34,
2880 0.1: 14.68,
2881 0.05: 16.92,
2882 0.025: 19.02,
2883 0.01: 21.67,
2884 0.005: 23.59
2885 },
2886 10: {
2887 0.995: 2.16,
2888 0.99: 2.56,
2889 0.975: 3.25,
2890 0.95: 3.94,
2891 0.9: 4.87,
2892 0.5: 9.34,
2893 0.1: 15.99,
2894 0.05: 18.31,
2895 0.025: 20.48,
2896 0.01: 23.21,
2897 0.005: 25.19
2898 },
2899 11: {
2900 0.995: 2.6,
2901 0.99: 3.05,
2902 0.975: 3.82,
2903 0.95: 4.57,
2904 0.9: 5.58,
2905 0.5: 10.34,
2906 0.1: 17.28,
2907 0.05: 19.68,
2908 0.025: 21.92,
2909 0.01: 24.72,
2910 0.005: 26.76
2911 },
2912 12: {
2913 0.995: 3.07,
2914 0.99: 3.57,
2915 0.975: 4.4,
2916 0.95: 5.23,
2917 0.9: 6.3,
2918 0.5: 11.34,
2919 0.1: 18.55,
2920 0.05: 21.03,
2921 0.025: 23.34,
2922 0.01: 26.22,
2923 0.005: 28.3
2924 },
2925 13: {
2926 0.995: 3.57,
2927 0.99: 4.11,
2928 0.975: 5.01,
2929 0.95: 5.89,
2930 0.9: 7.04,
2931 0.5: 12.34,
2932 0.1: 19.81,
2933 0.05: 22.36,
2934 0.025: 24.74,
2935 0.01: 27.69,
2936 0.005: 29.82
2937 },
2938 14: {
2939 0.995: 4.07,
2940 0.99: 4.66,
2941 0.975: 5.63,
2942 0.95: 6.57,
2943 0.9: 7.79,
2944 0.5: 13.34,
2945 0.1: 21.06,
2946 0.05: 23.68,
2947 0.025: 26.12,
2948 0.01: 29.14,
2949 0.005: 31.32
2950 },
2951 15: {
2952 0.995: 4.6,
2953 0.99: 5.23,
2954 0.975: 6.27,
2955 0.95: 7.26,
2956 0.9: 8.55,
2957 0.5: 14.34,
2958 0.1: 22.31,
2959 0.05: 25,
2960 0.025: 27.49,
2961 0.01: 30.58,
2962 0.005: 32.8
2963 },
2964 16: {
2965 0.995: 5.14,
2966 0.99: 5.81,
2967 0.975: 6.91,
2968 0.95: 7.96,
2969 0.9: 9.31,
2970 0.5: 15.34,
2971 0.1: 23.54,
2972 0.05: 26.3,
2973 0.025: 28.85,
2974 0.01: 32,
2975 0.005: 34.27
2976 },
2977 17: {
2978 0.995: 5.7,
2979 0.99: 6.41,
2980 0.975: 7.56,
2981 0.95: 8.67,
2982 0.9: 10.09,
2983 0.5: 16.34,
2984 0.1: 24.77,
2985 0.05: 27.59,
2986 0.025: 30.19,
2987 0.01: 33.41,
2988 0.005: 35.72
2989 },
2990 18: {
2991 0.995: 6.26,
2992 0.99: 7.01,
2993 0.975: 8.23,
2994 0.95: 9.39,
2995 0.9: 10.87,
2996 0.5: 17.34,
2997 0.1: 25.99,
2998 0.05: 28.87,
2999 0.025: 31.53,
3000 0.01: 34.81,
3001 0.005: 37.16
3002 },
3003 19: {
3004 0.995: 6.84,
3005 0.99: 7.63,
3006 0.975: 8.91,
3007 0.95: 10.12,
3008 0.9: 11.65,
3009 0.5: 18.34,
3010 0.1: 27.2,
3011 0.05: 30.14,
3012 0.025: 32.85,
3013 0.01: 36.19,
3014 0.005: 38.58
3015 },
3016 20: {
3017 0.995: 7.43,
3018 0.99: 8.26,
3019 0.975: 9.59,
3020 0.95: 10.85,
3021 0.9: 12.44,
3022 0.5: 19.34,
3023 0.1: 28.41,
3024 0.05: 31.41,
3025 0.025: 34.17,
3026 0.01: 37.57,
3027 0.005: 40
3028 },
3029 21: {
3030 0.995: 8.03,
3031 0.99: 8.9,
3032 0.975: 10.28,
3033 0.95: 11.59,
3034 0.9: 13.24,
3035 0.5: 20.34,
3036 0.1: 29.62,
3037 0.05: 32.67,
3038 0.025: 35.48,
3039 0.01: 38.93,
3040 0.005: 41.4
3041 },
3042 22: {
3043 0.995: 8.64,
3044 0.99: 9.54,
3045 0.975: 10.98,
3046 0.95: 12.34,
3047 0.9: 14.04,
3048 0.5: 21.34,
3049 0.1: 30.81,
3050 0.05: 33.92,
3051 0.025: 36.78,
3052 0.01: 40.29,
3053 0.005: 42.8
3054 },
3055 23: {
3056 0.995: 9.26,
3057 0.99: 10.2,
3058 0.975: 11.69,
3059 0.95: 13.09,
3060 0.9: 14.85,
3061 0.5: 22.34,
3062 0.1: 32.01,
3063 0.05: 35.17,
3064 0.025: 38.08,
3065 0.01: 41.64,
3066 0.005: 44.18
3067 },
3068 24: {
3069 0.995: 9.89,
3070 0.99: 10.86,
3071 0.975: 12.4,
3072 0.95: 13.85,
3073 0.9: 15.66,
3074 0.5: 23.34,
3075 0.1: 33.2,
3076 0.05: 36.42,
3077 0.025: 39.36,
3078 0.01: 42.98,
3079 0.005: 45.56
3080 },
3081 25: {
3082 0.995: 10.52,
3083 0.99: 11.52,
3084 0.975: 13.12,
3085 0.95: 14.61,
3086 0.9: 16.47,
3087 0.5: 24.34,
3088 0.1: 34.28,
3089 0.05: 37.65,
3090 0.025: 40.65,
3091 0.01: 44.31,
3092 0.005: 46.93
3093 },
3094 26: {
3095 0.995: 11.16,
3096 0.99: 12.2,
3097 0.975: 13.84,
3098 0.95: 15.38,
3099 0.9: 17.29,
3100 0.5: 25.34,
3101 0.1: 35.56,
3102 0.05: 38.89,
3103 0.025: 41.92,
3104 0.01: 45.64,
3105 0.005: 48.29
3106 },
3107 27: {
3108 0.995: 11.81,
3109 0.99: 12.88,
3110 0.975: 14.57,
3111 0.95: 16.15,
3112 0.9: 18.11,
3113 0.5: 26.34,
3114 0.1: 36.74,
3115 0.05: 40.11,
3116 0.025: 43.19,
3117 0.01: 46.96,
3118 0.005: 49.65
3119 },
3120 28: {
3121 0.995: 12.46,
3122 0.99: 13.57,
3123 0.975: 15.31,
3124 0.95: 16.93,
3125 0.9: 18.94,
3126 0.5: 27.34,
3127 0.1: 37.92,
3128 0.05: 41.34,
3129 0.025: 44.46,
3130 0.01: 48.28,
3131 0.005: 50.99
3132 },
3133 29: {
3134 0.995: 13.12,
3135 0.99: 14.26,
3136 0.975: 16.05,
3137 0.95: 17.71,
3138 0.9: 19.77,
3139 0.5: 28.34,
3140 0.1: 39.09,
3141 0.05: 42.56,
3142 0.025: 45.72,
3143 0.01: 49.59,
3144 0.005: 52.34
3145 },
3146 30: {
3147 0.995: 13.79,
3148 0.99: 14.95,
3149 0.975: 16.79,
3150 0.95: 18.49,
3151 0.9: 20.6,
3152 0.5: 29.34,
3153 0.1: 40.26,
3154 0.05: 43.77,
3155 0.025: 46.98,
3156 0.01: 50.89,
3157 0.005: 53.67
3158 },
3159 40: {
3160 0.995: 20.71,
3161 0.99: 22.16,
3162 0.975: 24.43,
3163 0.95: 26.51,
3164 0.9: 29.05,
3165 0.5: 39.34,
3166 0.1: 51.81,
3167 0.05: 55.76,
3168 0.025: 59.34,
3169 0.01: 63.69,
3170 0.005: 66.77
3171 },
3172 50: {
3173 0.995: 27.99,
3174 0.99: 29.71,
3175 0.975: 32.36,
3176 0.95: 34.76,
3177 0.9: 37.69,
3178 0.5: 49.33,
3179 0.1: 63.17,
3180 0.05: 67.5,
3181 0.025: 71.42,
3182 0.01: 76.15,
3183 0.005: 79.49
3184 },
3185 60: {
3186 0.995: 35.53,
3187 0.99: 37.48,
3188 0.975: 40.48,
3189 0.95: 43.19,
3190 0.9: 46.46,
3191 0.5: 59.33,
3192 0.1: 74.4,
3193 0.05: 79.08,
3194 0.025: 83.3,
3195 0.01: 88.38,
3196 0.005: 91.95
3197 },
3198 70: {
3199 0.995: 43.28,
3200 0.99: 45.44,
3201 0.975: 48.76,
3202 0.95: 51.74,
3203 0.9: 55.33,
3204 0.5: 69.33,
3205 0.1: 85.53,
3206 0.05: 90.53,
3207 0.025: 95.02,
3208 0.01: 100.42,
3209 0.005: 104.22
3210 },
3211 80: {
3212 0.995: 51.17,
3213 0.99: 53.54,
3214 0.975: 57.15,
3215 0.95: 60.39,
3216 0.9: 64.28,
3217 0.5: 79.33,
3218 0.1: 96.58,
3219 0.05: 101.88,
3220 0.025: 106.63,
3221 0.01: 112.33,
3222 0.005: 116.32
3223 },
3224 90: {
3225 0.995: 59.2,
3226 0.99: 61.75,
3227 0.975: 65.65,
3228 0.95: 69.13,
3229 0.9: 73.29,
3230 0.5: 89.33,
3231 0.1: 107.57,
3232 0.05: 113.14,
3233 0.025: 118.14,
3234 0.01: 124.12,
3235 0.005: 128.3
3236 },
3237 100: {
3238 0.995: 67.33,
3239 0.99: 70.06,
3240 0.975: 74.22,
3241 0.95: 77.93,
3242 0.9: 82.36,
3243 0.5: 99.33,
3244 0.1: 118.5,
3245 0.05: 124.34,
3246 0.025: 129.56,
3247 0.01: 135.81,
3248 0.005: 140.17
3249 }
3250};
3251
3252/**
3253 * The [χ2 (Chi-Squared) Goodness-of-Fit Test](http://en.wikipedia.org/wiki/Goodness_of_fit#Pearson.27s_chi-squared_test)
3254 * uses a measure of goodness of fit which is the sum of differences between observed and expected outcome frequencies
3255 * (that is, counts of observations), each squared and divided by the number of observations expected given the
3256 * hypothesized distribution. The resulting χ2 statistic, `chiSquared`, can be compared to the chi-squared distribution
3257 * to determine the goodness of fit. In order to determine the degrees of freedom of the chi-squared distribution, one
3258 * takes the total number of observed frequencies and subtracts the number of estimated parameters. The test statistic
3259 * follows, approximately, a chi-square distribution with (k − c) degrees of freedom where `k` is the number of non-empty
3260 * cells and `c` is the number of estimated parameters for the distribution.
3261 *
3262 * @param {Array<number>} data
3263 * @param {Function} distributionType a function that returns a point in a distribution:
3264 * for instance, binomial, bernoulli, or poisson
3265 * @param {number} significance significance level, e.g. 0.05; must be one of the levels in `chiSquaredDistributionTable`
3266 * @returns {boolean} true if the null hypothesis of a good fit is rejected at the given significance level
3267 * @example
3268 * // Data from Poisson goodness-of-fit example 10-19 in William W. Hines & Douglas C. Montgomery,
3269 * // "Probability and Statistics in Engineering and Management Science", Wiley (1980).
3270 * var data1019 = [
3271 * 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3272 * 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3273 * 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3274 * 2, 2, 2, 2, 2, 2, 2, 2, 2,
3275 * 3, 3, 3, 3
3276 * ];
3277 * ss.chiSquaredGoodnessOfFit(data1019, ss.poissonDistribution, 0.05); // => false
3278 */
3279function chiSquaredGoodnessOfFit(data, distributionType, significance) {
3280 // Estimate from the sample data, a weighted mean.
3281 var inputMean = mean(data);
3282 // Calculated value of the χ2 statistic.
3283 var chiSquared = 0;
3284 // Number of hypothesized distribution parameters estimated, expected to be supplied in the distribution test.
3285 // Lose one degree of freedom for estimating `lambda` from the sample data.
3286 var c = 1;
3287 // Generate the hypothesized distribution.
3289 var hypothesizedDistribution = distributionType(inputMean);
3290 var observedFrequencies = [];
3291 var expectedFrequencies = [];
3292
3293 // Create an array holding a histogram from the sample data, of
3294 // the form `{ value: numberOfOccurrences }`
3295 for (var i = 0; i < data.length; i++) {
3296 if (observedFrequencies[data[i]] === undefined) {
3297 observedFrequencies[data[i]] = 0;
3298 }
3299 observedFrequencies[data[i]]++;
3300 }
3301
3302 // The histogram we created might be sparse - there might be gaps
3303 // between values. So we iterate through the histogram, making
3304 // sure that instead of undefined, gaps have 0 values.
3305 for (var i$1 = 0; i$1 < observedFrequencies.length; i$1++) {
3306 if (observedFrequencies[i$1] === undefined) {
3307 observedFrequencies[i$1] = 0;
3308 }
3309 }
3310
3311 // Create an array holding a histogram of expected data given the
3312 // sample size and hypothesized distribution.
3313 for (var k in hypothesizedDistribution) {
3314 if (k in observedFrequencies) {
3315 expectedFrequencies[+k] = hypothesizedDistribution[k] * data.length;
3316 }
3317 }
3318
3319 // Working backward through the expected frequencies, collapse classes
3320 // if less than three observations are expected for a class.
3321 // This transformation is applied to the observed frequencies as well.
3322 for (var k$1 = expectedFrequencies.length - 1; k$1 >= 0; k$1--) {
3323 if (expectedFrequencies[k$1] < 3) {
3324 expectedFrequencies[k$1 - 1] += expectedFrequencies[k$1];
3325 expectedFrequencies.pop();
3326
3327 observedFrequencies[k$1 - 1] += observedFrequencies[k$1];
3328 observedFrequencies.pop();
3329 }
3330 }
3331
3332 // Iterate through the squared differences between observed & expected
3333 // frequencies, accumulating the `chiSquared` statistic.
3334 for (var k$2 = 0; k$2 < observedFrequencies.length; k$2++) {
3335 chiSquared +=
3336 Math.pow(observedFrequencies[k$2] - expectedFrequencies[k$2], 2) /
3337 expectedFrequencies[k$2];
3338 }
3339
3340 // Calculate degrees of freedom for this test and look it up in the
3341 // `chiSquaredDistributionTable` in order to
3342 // accept or reject the goodness-of-fit of the hypothesized distribution.
3343 // Degrees of freedom, calculated as (number of class intervals -
3344 // number of hypothesized distribution parameters estimated - 1)
3345 var degreesOfFreedom = observedFrequencies.length - c - 1;
3346 return (
3347 chiSquaredDistributionTable[degreesOfFreedom][significance] < chiSquared
3348 );
3349}
3350
3351var SQRT_2PI = Math.sqrt(2 * Math.PI);
3352
3353/**
3354 * [Well-known kernels](https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use)
3355 * @private
3356 */
3357var kernels = {
3358 /**
3359 * The gaussian kernel.
3360 * @private
3361 */
3362 gaussian: function (u) {
3363 return Math.exp(-0.5 * u * u) / SQRT_2PI;
3364 }
3365};
3366
3367/**
3368 * Well-known bandwidth selection methods
3369 * @private
3370 */
3371var bandwidthMethods = {
3372 /**
3373 * The ["normal reference distribution"
3374 * rule-of-thumb](https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/bandwidth.nrd.html),
3375 * a commonly used version of [Silverman's
3376 * rule-of-thumb](https://en.wikipedia.org/wiki/Kernel_density_estimation#A_rule-of-thumb_bandwidth_estimator).
3377 * @private
3378 */
3379 nrd: function (x) {
3380 var s = sampleStandardDeviation(x);
3381 var iqr = interquartileRange(x);
3382 if (typeof iqr === "number") {
3383 s = Math.min(s, iqr / 1.34);
3384 }
3385 return 1.06 * s * Math.pow(x.length, -0.2);
3386 }
3387};
3388
3389/**
3390 * [Kernel density estimation](https://en.wikipedia.org/wiki/Kernel_density_estimation)
3391 * is a useful tool for, among other things, estimating the shape of the
3392 * underlying probability distribution from a sample.
3393 *
3394 * @name kernelDensityEstimation
3395 * @param X sample values
3396 * @param kernel The kernel function to use. If a function is provided, it should return non-negative values and integrate to 1. Defaults to 'gaussian'.
3397 * @param bandwidthMethod The "bandwidth selection" method to use, or a fixed bandwidth value. Defaults to "nrd", the commonly-used ["normal reference distribution" rule-of-thumb](https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/bandwidth.nrd.html).
3398 * @returns {Function} An estimated [probability density function](https://en.wikipedia.org/wiki/Probability_density_function) for the given sample. The returned function runs in `O(X.length)`.
3399 */
3400function kernelDensityEstimation(X, kernel, bandwidthMethod) {
3401 var kernelFn;
3402 if (kernel === undefined) {
3403 kernelFn = kernels.gaussian;
3404 } else if (typeof kernel === "string") {
3405 if (!kernels[kernel]) {
3406 throw new Error('Unknown kernel "' + kernel + '"');
3407 }
3408 kernelFn = kernels[kernel];
3409 } else {
3410 kernelFn = kernel;
3411 }
3412
3413 var bandwidth;
3414 if (typeof bandwidthMethod === "undefined") {
3415 bandwidth = bandwidthMethods.nrd(X);
3416 } else if (typeof bandwidthMethod === "string") {
3417 if (!bandwidthMethods[bandwidthMethod]) {
3418 throw new Error(
3419 'Unknown bandwidth method "' + bandwidthMethod + '"'
3420 );
3421 }
3422 bandwidth = bandwidthMethods[bandwidthMethod](X);
3423 } else {
3424 bandwidth = bandwidthMethod;
3425 }
3426
3427 return function (x) {
3428 var i = 0;
3429 var sum = 0;
3430 for (i = 0; i < X.length; i++) {
3431 sum += kernelFn((x - X[i]) / bandwidth);
3432 }
3433 return sum / bandwidth / X.length;
3434 };
3435}
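
// A usage sketch (illustrative; not part of the library source): with the
// gaussian kernel and a fixed bandwidth of 1, the density estimate at
// x = 2 for the sample [1, 2, 3] averages the kernel over the scaled
// distances -1, 0 and 1.
//
//     var pdf = kernelDensityEstimation([1, 2, 3], "gaussian", 1);
//     pdf(2).toFixed(2);
//     // => '0.29'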
3436
3437/**
3438 * The [Z-Score, or Standard Score](http://en.wikipedia.org/wiki/Standard_score).
3439 *
3440 * The standard score is the number of standard deviations an observation
3441 * or datum is above or below the mean. Thus, a positive standard score
3442 * represents a datum above the mean, while a negative standard score
3443 * represents a datum below the mean. It is a dimensionless quantity
3444 * obtained by subtracting the population mean from an individual raw
3445 * score and then dividing the difference by the population standard
3446 * deviation.
3447 *
3448 * The z-score is only defined if one knows the population parameters;
3449 * if one only has a sample set, then the analogous computation with
3450 * sample mean and sample standard deviation yields the
3451 * Student's t-statistic.
3452 *
3453 * @param {number} x
3454 * @param {number} mean
3455 * @param {number} standardDeviation
3456 * @return {number} z score
3457 * @example
3458 * zScore(78, 80, 5); // => -0.4
3459 */
3460function zScore(x, mean, standardDeviation) {
3461 return (x - mean) / standardDeviation;
3462}
3463
3464var SQRT_2PI$1 = Math.sqrt(2 * Math.PI);
3465
3466function cumulativeDistribution(z) {
3467 var sum = z,
3468 tmp = z;
3469
3470 // 15 iterations are enough for 4-digit precision
3471 for (var i = 1; i < 15; i++) {
3472 tmp *= (z * z) / (2 * i + 1);
3473 sum += tmp;
3474 }
3475 return (
3476 Math.round((0.5 + (sum / SQRT_2PI$1) * Math.exp((-z * z) / 2)) * 1e4) /
3477 1e4
3478 );
3479}
3480
3481/**
3482 * A standard normal table, also called the unit normal table or Z table,
3483 * is a mathematical table for the values of Φ (phi), which are the values of
3484 * the cumulative distribution function of the normal distribution.
3485 * It is used to find the probability that a statistic is observed below,
3486 * above, or between values on the standard normal distribution, and by
3487 * extension, any normal distribution.
3488 *
3489 * The probabilities are calculated using the
3490 * [Cumulative distribution function](https://en.wikipedia.org/wiki/Normal_distribution#Cumulative_distribution_function).
3491 * The table used is the cumulative distribution, not the one cumulative from 0 to the mean
3492 * (even though the latter has 5 digits of precision, instead of 4).
3493 */
3494var standardNormalTable = [];
3495
3496for (var z = 0; z <= 3.09; z += 0.01) {
3497 standardNormalTable.push(cumulativeDistribution(z));
3498}
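
// A lookup sketch (illustrative; not part of the library source): index
// 100 corresponds to z = 1.00, so the entry is Φ(1) rounded to four
// digits.
//
//     standardNormalTable[100];
//     // => 0.8413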
3499
3500/**
3501 * **[Cumulative Standard Normal Probability](http://en.wikipedia.org/wiki/Standard_normal_table)**
3502 *
3503 * Since probability tables cannot be
3504 * printed for every normal distribution, as there are an infinite variety
3505 * of normal distributions, it is common practice to convert a normal to a
3506 * standard normal and then use the standard normal table to find probabilities.
3507 *
3508 * You can use `.5 + .5 * errorFunction(x / Math.sqrt(2))` to calculate the probability
3509 * instead of looking it up in a table.
3510 *
3511 * @param {number} z
3512 * @returns {number} cumulative standard normal probability
3513 */
3514function cumulativeStdNormalProbability(z) {
3515 // Calculate the position of this value.
3516 var absZ = Math.abs(z);
3517 // Each entry in the table corresponds
3518 // to a step of 0.01 in the input value,
3519 // so the value is multiplied by 100
3520 // to find its table index.
3521 var index = Math.min(
3522 Math.round(absZ * 100),
3523 standardNormalTable.length - 1
3524 );
3525
3526 // The index we calculate must be in the table as a positive value,
3527 // but we still pay attention to whether the input is positive
3528 // or negative, and flip the output value as a last step.
3529 if (z >= 0) {
3530 return standardNormalTable[index];
3531 } else {
3532 // due to floating-point arithmetic, subtracting a 4-decimal table
3533 // value from 1 can produce a long repeating fraction, so the
3534 // result is rounded back to 4 decimal places.
3535 return +(1 - standardNormalTable[index]).toFixed(4);
3536 }
3537}
3538
3539/**
3540 * **[Gaussian error function](http://en.wikipedia.org/wiki/Error_function)**
3541 *
3542 * The `errorFunction(x/(sd * Math.sqrt(2)))` is the probability that a value in a
3543 * normal distribution with standard deviation sd is within x of the mean.
3544 *
3545 * This function returns a numerical approximation to the exact value.
3546 * It uses Horner's method to evaluate the polynomial in t that defines τ (tau).
3547 *
3548 * @param {number} x input
3549 * @return {number} error estimation
3550 * @example
3551 * errorFunction(1).toFixed(2); // => '0.84'
3552 */
3553function errorFunction(x) {
3554 var t = 1 / (1 + 0.5 * Math.abs(x));
3555 var tau =
3556 t *
3557 Math.exp(
3558 -x * x +
3559 ((((((((0.17087277 * t - 0.82215223) * t + 1.48851587) * t -
3560 1.13520398) *
3561 t +
3562 0.27886807) *
3563 t -
3564 0.18628806) *
3565 t +
3566 0.09678418) *
3567 t +
3568 0.37409196) *
3569 t +
3570 1.00002368) *
3571 t -
3572 1.26551223
3573 );
3574 if (x >= 0) {
3575 return 1 - tau;
3576 } else {
3577 return tau - 1;
3578 }
3579}
3580
3581/**
3582 * The Inverse [Gaussian error function](http://en.wikipedia.org/wiki/Error_function)
3583 * returns a numerical approximation to the value that would have caused
3584 * `errorFunction()` to return x.
3585 *
3586 * @param {number} x value of error function
3587 * @returns {number} estimated inverted value
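 * @example
 * inverseErrorFunction(errorFunction(1)).toFixed(2); // => '1.00'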
3588 */
3589function inverseErrorFunction(x) {
3590 var a = (8 * (Math.PI - 3)) / (3 * Math.PI * (4 - Math.PI));
3591
3592 var inv = Math.sqrt(
3593 Math.sqrt(
3594 Math.pow(2 / (Math.PI * a) + Math.log(1 - x * x) / 2, 2) -
3595 Math.log(1 - x * x) / a
3596 ) -
3597 (2 / (Math.PI * a) + Math.log(1 - x * x) / 2)
3598 );
3599
3600 if (x >= 0) {
3601 return inv;
3602 } else {
3603 return -inv;
3604 }
3605}
3606
3607/**
3608 * The [Probit](http://en.wikipedia.org/wiki/Probit)
3609 * is the inverse of cumulativeStdNormalProbability(),
3610 * and is also known as the normal quantile function.
3611 *
3612 * It returns the number of standard deviations from the mean
3613 * where the p'th quantile of values can be found in a normal distribution.
3614 * So, for example, probit(0.5 + 0.6827/2) ≈ 1 because 68.27% of values are
3615 * normally found within 1 standard deviation above or below the mean.
3616 *
3617 * @param {number} p
3618 * @returns {number} probit
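 * @example
 * probit(0.975); // ≈ 1.96, the familiar two-sided 95% critical value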
3619 */
3620function probit(p) {
3621 if (p === 0) {
3622 p = epsilon;
3623 } else if (p >= 1) {
3624 p = 1 - epsilon;
3625 }
3626 return Math.sqrt(2) * inverseErrorFunction(2 * p - 1);
3627}
3628
3629/**
3630 * Conducts a [permutation test](https://en.wikipedia.org/wiki/Resampling_(statistics)#Permutation_tests)
3631 * to determine if two data sets are *significantly* different from each other, using
3632 * the difference of means between the groups as the test statistic.
3633 * The function allows for the following hypotheses:
3634 * - two_side = Null hypothesis: the two distributions are equal.
3635 * - greater = Null hypothesis: observations from sampleX tend to be smaller than those from sampleY.
3636 * - less = Null hypothesis: observations from sampleX tend to be greater than those from sampleY.
3637 * [Learn more about one-tail vs two-tail tests.](https://en.wikipedia.org/wiki/One-_and_two-tailed_tests)
3638 *
3639 * @param {Array<number>} sampleX first dataset (e.g. treatment data)
3640 * @param {Array<number>} sampleY second dataset (e.g. control data)
3641 * @param {string} [alternative='two_side'] alternative hypothesis, either 'two_side', 'greater', or 'less'
3642 * @param {number} [k=10000] number of values in the permutation distribution
3643 * @param {Function} [randomSource=Math.random] an optional entropy source
3644 * @returns {number} p-value The probability of observing a difference between groups as extreme as, or more extreme than, the measured difference, assuming the null hypothesis.
3645 *
3646 * @example
3647 * var control = [2, 5, 3, 6, 7, 2, 5];
3648 * var treatment = [20, 5, 13, 12, 7, 2, 2];
3649 * permutationTest(control, treatment); // ~0.1324
3650 */
3651function permutationTest(sampleX, sampleY, alternative, k, randomSource) {
3652 // Set default arguments
3653 if (k === undefined) {
3654 k = 10000;
3655 }
3656 if (alternative === undefined) {
3657 alternative = "two_side";
3658 }
3659 if (
3660 alternative !== "two_side" &&
3661 alternative !== "greater" &&
3662 alternative !== "less"
3663 ) {
3664 throw new Error(
3665 "`alternative` must be either 'two_side', 'greater', or 'less'"
3666 );
3667 }
3668
3669 // get means for each sample
3670 var meanX = mean(sampleX);
3671 var meanY = mean(sampleY);
3672
3673 // calculate initial test statistic. This will be our point of comparison with
3674 // the generated test statistics.
3675 var testStatistic = meanX - meanY;
3676
3677 // create test-statistic distribution
3678 var testStatDsn = new Array(k);
3679
3680 // combine datasets so we can easily shuffle later
3681 var allData = sampleX.concat(sampleY);
3682 var splitIndex = sampleX.length; // keep the original group sizes when re-labeling
3683
3684 for (var i = 0; i < k; i++) {
3685 // 1. shuffle data assignments
3686 shuffleInPlace(allData, randomSource);
3687 var permLeft = allData.slice(0, splitIndex);
3688 var permRight = allData.slice(splitIndex, allData.length);
3689
3690 // 2. re-calculate test statistic
3691 var permTestStatistic = mean(permLeft) - mean(permRight);
3692
3693 // 3. store test statistic to build test statistic distribution
3694 testStatDsn[i] = permTestStatistic;
3695 }
3696
3697 // Calculate p-value depending on alternative
3698 // For this test, we calculate the percentage of 'extreme' test statistics (subject to our hypothesis)
3699 // more info on permutation test p-value calculations: https://onlinecourses.science.psu.edu/stat464/node/35
3700 var numExtremeTStats = 0;
3701 if (alternative === "two_side") {
3702 for (var i$1 = 0; i$1 < k; i$1++) {
3703 if (Math.abs(testStatDsn[i$1]) >= Math.abs(testStatistic)) {
3704 numExtremeTStats += 1;
3705 }
3706 }
3707 } else if (alternative === "greater") {
3708 for (var i$2 = 0; i$2 < k; i$2++) {
3709 if (testStatDsn[i$2] >= testStatistic) {
3710 numExtremeTStats += 1;
3711 }
3712 }
3713 } else {
3714 // alternative === 'less'
3715 for (var i$3 = 0; i$3 < k; i$3++) {
3716 if (testStatDsn[i$3] <= testStatistic) {
3717 numExtremeTStats += 1;
3718 }
3719 }
3720 }
3721
3722 return numExtremeTStats / k;
3723}
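// Illustrative usage (a sketch, not part of the library's documented
// examples): a one-sided test of whether `treatment` observations tend
// to be greater than `control` observations. `k` and `randomSource` are
// optional; any function returning uniform values in [0, 1) works as an
// entropy source. The Park-Miller generator below is a hypothetical
// stand-in used only to make the permutations reproducible.
//
//     var control = [2, 5, 3, 6, 7, 2, 5];
//     var treatment = [20, 5, 13, 12, 7, 2, 2];
//     var seed = 1;
//     function pseudoRandom() {
//         seed = (seed * 16807) % 2147483647; // Park-Miller LCG step
//         return (seed - 1) / 2147483646;     // scale into [0, 1)
//     }
//     permutationTest(treatment, control, "greater", 1000, pseudoRandom);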
3724
3725/**
3726 * [Sign](https://en.wikipedia.org/wiki/Sign_function) is a function
3727 * that extracts the sign of a real number
3728 *
3729 * @param {number} x input value
3730 * @returns {number} sign value either 1, 0 or -1
3731 * @throws {TypeError} if the input argument x is not a number
3732 * @private
3733 *
3734 * @example
3735 * sign(2); // => 1
3736 */
3737function sign(x) {
3738 if (typeof x === "number") {
3739 if (x < 0) {
3740 return -1;
3741 } else if (x === 0) {
3742 return 0;
3743 } else {
3744 return 1;
3745 }
3746 } else {
3747 throw new TypeError("not a number");
3748 }
3749}
3750
3751/**
3752 * [Bisection method](https://en.wikipedia.org/wiki/Bisection_method) is a root-finding
3753 * method that repeatedly bisects an interval to find the root.
3754 *
3755 * This function returns a numerical approximation to the exact value.
3756 *
3757 * @param {Function} func input function
3758 * @param {number} start - start of interval
3759 * @param {number} end - end of interval
3760 * @param {number} maxIterations - the maximum number of iterations
3761 * @param {number} errorTolerance - the error tolerance
3762 * @returns {number} estimated root value
3763 * @throws {TypeError} Argument func must be a function
3764 *
3765 * @example
3766 * bisect(Math.cos,0,4,100,0.003); // => 1.572265625
3767 */
3768function bisect(func, start, end, maxIterations, errorTolerance) {
3769 if (typeof func !== "function")
3770 { throw new TypeError("func must be a function"); }
3771
3772 for (var i = 0; i < maxIterations; i++) {
3773 var output = (start + end) / 2;
3774
3775 if (
3776 func(output) === 0 ||
3777 Math.abs((end - start) / 2) < errorTolerance
3778 ) {
3779 return output;
3780 }
3781
3782 if (sign(func(output)) === sign(func(start))) {
3783 start = output;
3784 } else {
3785 end = output;
3786 }
3787 }
3788
3789 throw new Error("maximum number of iterations exceeded");
3790}
3791
3792/**
3793 * Calculate Euclidean distance between two points.
3794 * @param {Array<number>} left First N-dimensional point.
3795 * @param {Array<number>} right Second N-dimensional point.
3796 * @returns {number} Distance.
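 * @example
 * euclideanDistance([0, 0], [3, 4]); // => 5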
3797 */
3798function euclideanDistance(left, right) {
3799 var sum = 0;
3800 for (var i = 0; i < left.length; i++) {
3801 var diff = left[i] - right[i];
3802 sum += diff * diff;
3803 }
3804 return Math.sqrt(sum);
3805}
3806
3807/**
3808 * @typedef {Object} kMeansReturn
3809 * @property {Array<number>} labels The labels.
3810 * @property {Array<Array<number>>} centroids The cluster centroids.
3811 */
3812
3813/**
3814 * Perform k-means clustering.
3815 *
3816 * @param {Array<Array<number>>} points N-dimensional coordinates of points to be clustered.
3817 * @param {number} numCluster How many clusters to create.
3818 * @param {Function} randomSource An optional entropy source that generates uniform values in [0, 1).
3819 * @return {kMeansReturn} Labels (same length as data) and centroids (same length as numCluster).
3820 * @throws {Error} If any centroids wind up friendless (i.e., without associated points).
3821 *
3822 * @example
3823 * kMeansCluster([[0.0, 0.5], [1.0, 0.5]], 2); // => {labels: [0, 1], centroids: [[0.0, 0.5], [1.0, 0.5]]}
3824 */
3825function kMeansCluster(points, numCluster, randomSource) {
3826 if ( randomSource === void 0 ) randomSource = Math.random;
3827
3828 var oldCentroids = null;
3829 var newCentroids = sample(points, numCluster, randomSource);
3830 var labels = null;
3831 var change = Number.MAX_VALUE;
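    // Lloyd's algorithm: alternately assign every point to its nearest
    // centroid and move every centroid to the mean of its assigned
    // points, stopping once an iteration no longer moves any centroid.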
3832 while (change !== 0) {
3833 labels = labelPoints(points, newCentroids);
3834 oldCentroids = newCentroids;
3835 newCentroids = calculateCentroids(points, labels, numCluster);
3836 change = calculateChange(newCentroids, oldCentroids);
3837 }
3838 return {
3839 labels: labels,
3840 centroids: newCentroids
3841 };
3842}
3843
3844/**
3845 * Label each point according to which centroid it is closest to.
3846 *
3847 * @private
3848 * @param {Array<Array<number>>} points Array of XY coordinates.
3849 * @param {Array<Array<number>>} centroids Current centroids.
3850 * @return {Array<number>} Group labels.
3851 */
3852function labelPoints(points, centroids) {
3853 return points.map(function (p) {
3854 var minDist = Number.MAX_VALUE;
3855 var label = -1;
3856 for (var i = 0; i < centroids.length; i++) {
3857 var dist = euclideanDistance(p, centroids[i]);
3858 if (dist < minDist) {
3859 minDist = dist;
3860 label = i;
3861 }
3862 }
3863 return label;
3864 });
3865}
3866
3867/**
3868 * Calculate centroids for points given labels.
3869 *
3870 * @private
3871 * @param {Array<Array<number>>} points Array of XY coordinates.
3872 * @param {Array<number>} labels Which groups points belong to.
3873 * @param {number} numCluster Number of clusters being created.
3874 * @return {Array<Array<number>>} Centroid for each group.
3875 * @throws {Error} If any centroids wind up friendless (i.e., without associated points).
3876 */
3877function calculateCentroids(points, labels, numCluster) {
3878 // Initialize accumulators.
3879 var dimension = points[0].length;
3880 var centroids = makeMatrix(numCluster, dimension);
3881 var counts = Array(numCluster).fill(0);
3882
3883 // Add points to centroids' accumulators and count points per centroid.
3884 var numPoints = points.length;
3885 for (var i = 0; i < numPoints; i++) {
3886 var point = points[i];
3887 var label = labels[i];
3888 var current = centroids[label];
3889 for (var j = 0; j < dimension; j++) {
3890 current[j] += point[j];
3891 }
3892 counts[label] += 1;
3893 }
3894
3895 // Rescale centroids, checking for any that have no points.
3896 for (var i$1 = 0; i$1 < numCluster; i$1++) {
3897 if (counts[i$1] === 0) {
3898 throw new Error(("Centroid " + i$1 + " has no friends"));
3899 }
3900 var centroid = centroids[i$1];
3901 for (var j$1 = 0; j$1 < dimension; j$1++) {
3902 centroid[j$1] /= counts[i$1];
3903 }
3904 }
3905
3906 return centroids;
3907}
3908
3909/**
3910 * Calculate the difference between old centroids and new centroids.
3911 *
3912 * @private
3913 * @param {Array<Array<number>>} left One list of centroids.
3914 * @param {Array<Array<number>>} right Another list of centroids.
3915 * @return {number} Distance between centroids.
3916 */
3917function calculateChange(left, right) {
3918 var total = 0;
3919 for (var i = 0; i < left.length; i++) {
3920 total += euclideanDistance(left[i], right[i]);
3921 }
3922 return total;
3923}
3924
3925/**
3926 * Calculate the [silhouette values](https://en.wikipedia.org/wiki/Silhouette_(clustering))
3927 * for clustered data.
3928 *
3929 * @param {Array<Array<number>>} points N-dimensional coordinates of points.
3930 * @param {Array<number>} labels Labels of points. This must be the same length as `points`,
3931 * and values must lie in [0..G-1], where G is the number of groups.
3932 * @return {Array<number>} The silhouette value for each point.
3933 *
3934 * @example
3935 * silhouette([[0.25], [0.75]], [0, 0]); // => [1.0, 1.0]
3936 */
3937function silhouette(points, labels) {
3938 if (points.length !== labels.length) {
3939 throw new Error("must have exactly as many labels as points");
3940 }
3941 var groupings = createGroups(labels);
3942 var distances = calculateAllDistances(points);
3943 var result = [];
3944 for (var i = 0; i < points.length; i++) {
3945 var s = 0;
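        // a is the mean distance from this point to the members of its
        // own cluster and b is the mean distance to the members of the
        // nearest other cluster; the silhouette is (b - a) / max(a, b).
        // Points in singleton clusters keep the conventional score of 0.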
3946 if (groupings[labels[i]].length > 1) {
3947 var a = meanDistanceFromPointToGroup(
3948 i,
3949 groupings[labels[i]],
3950 distances
3951 );
3952 var b = meanDistanceToNearestGroup(
3953 i,
3954 labels,
3955 groupings,
3956 distances
3957 );
3958 s = (b - a) / Math.max(a, b);
3959 }
3960 result.push(s);
3961 }
3962 return result;
3963}
3964
3965/**
3966 * Create a lookup table mapping group IDs to point IDs.
3967 *
3968 * @private
3969 * @param {Array<number>} labels Labels of points. This must be the same length as `points`,
3970 * and values must lie in [0..G-1], where G is the number of groups.
3971 * @return {Array<Array<number>>} An array of length G, each of whose entries is an array
3972 * containing the indices of the points in that group.
3973 */
3974function createGroups(labels) {
3975 var numGroups = 1 + max(labels);
3976 var result = Array(numGroups);
3977 for (var i = 0; i < labels.length; i++) {
3978 var label = labels[i];
3979 if (result[label] === undefined) {
3980 result[label] = [];
3981 }
3982 result[label].push(i);
3983 }
3984 return result;
3985}
3986
3987/**
3988 * Create a lookup table of all inter-point distances.
3989 *
3990 * @private
3991 * @param {Array<Array<number>>} points N-dimensional coordinates of points.
3992 * @return {Array<Array<number>>} A symmetric square array of inter-point distances
3993 * (zero on the diagonal).
3994 */
3995function calculateAllDistances(points) {
3996 var numPoints = points.length;
3997 var result = makeMatrix(numPoints, numPoints);
3998 for (var i = 0; i < numPoints; i++) {
3999 for (var j = 0; j < i; j++) {
4000 result[i][j] = euclideanDistance(points[i], points[j]);
4001 result[j][i] = result[i][j];
4002 }
4003 }
4004 return result;
4005}
4006
4007/**
4008 * Calculate the mean distance between this point and all the points in the
4009 * nearest group (as determined by which point in another group is closest).
4010 *
4011 * @private
4012 * @param {number} which The index of this point.
4013 * @param {Array<number>} labels Labels of points.
4014 * @param {Array<Array<number>>} groupings An array whose entries are arrays
4015 * containing the indices of the points in that group.
4016 * @param {Array<Array<number>>} distances A symmetric square array of inter-point
4017 * distances.
4018 * @return {number} The mean distance from this point to others in the nearest
4019 * group.
4020 */
4021function meanDistanceToNearestGroup(which, labels, groupings, distances) {
4022 var label = labels[which];
4023 var result = Number.MAX_VALUE;
4024 for (var i = 0; i < groupings.length; i++) {
4025 if (i !== label) {
4026 var d = meanDistanceFromPointToGroup(
4027 which,
4028 groupings[i],
4029 distances
4030 );
4031 if (d < result) {
4032 result = d;
4033 }
4034 }
4035 }
4036 return result;
4037}
4038
4039/**
4040 * Calculate the mean distance between a point and all the points in a group
4041 * (possibly its own).
4042 *
4043 * @private
4044 * @param {number} which The index of this point.
4045 * @param {Array<number>} group The indices of all the points in the group in
4046 * question.
4047 * @param {Array<Array<number>>} distances A symmetric square array of inter-point
4048 * distances.
4049 * @return {number} The mean distance from this point to others in the
4050 * specified group.
4051 */
4052function meanDistanceFromPointToGroup(which, group, distances) {
4053 var total = 0;
4054 for (var i = 0; i < group.length; i++) {
4055 total += distances[which][group[i]];
4056 }
4057 return total / group.length;
4058}
4059
4060/**
4061 * Calculate the [silhouette metric](https://en.wikipedia.org/wiki/Silhouette_(clustering))
4062 * for a set of N-dimensional points arranged in groups. The metric is the largest
4063 * individual silhouette value for the data.
4064 *
4065 * @param {Array<Array<number>>} points N-dimensional coordinates of points.
4066 * @param {Array<number>} labels Labels of points. This must be the same length as `points`,
4067 * and values must lie in [0..G-1], where G is the number of groups.
4068 * @return {number} The silhouette metric for the groupings.
4069 *
4070 * @example
4071 * silhouetteMetric([[0.25], [0.75]], [0, 0]); // => 1.0
4072 */
4073function silhouetteMetric(points, labels) {
4074 var values = silhouette(points, labels);
4075 return max(values);
4076}
4077
4078/**
4079 * Relative error.
4080 *
4081 * This is more difficult to calculate than it first appears [1,2]. The usual
4082 * formula for the relative error between an actual value A and an expected
4083 * value E is `|(A-E)/E|`, but:
4084 *
4085 * 1. If the expected value is 0, any other value has infinite relative error,
4086 * which is counter-intuitive: if the expected voltage is 0, getting 1/10th
4087 * of a volt doesn't feel like an infinitely large error.
4088 *
4089 * 2. This formula does not satisfy the mathematical definition of a metric [3].
4090 * [4] solved this problem by defining the relative error as `|ln(|A/E|)|`,
4091 * but that formula only works if all values are positive: for example, it
4092 * reports the relative error of -10 and 10 as 0.
4093 *
4094 * Our implementation sticks with convention and returns:
4095 *
4096 * - 0 if the actual and expected values are both zero
4097 * - Infinity if the actual value is non-zero and the expected value is zero
4098 * - `|(A-E)/E|` in all other cases
4099 *
4100 * [1] https://math.stackexchange.com/questions/677852/how-to-calculate-relative-error-when-true-value-is-zero
4101 * [2] https://en.wikipedia.org/wiki/Relative_change_and_difference
4102 * [3] https://en.wikipedia.org/wiki/Metric_(mathematics)#Definition
4103 * [4] F.W.J. Olver: "A New Approach to Error Arithmetic." SIAM Journal on
4104 * Numerical Analysis, 15(2), 1978, 10.1137/0715024.
4105 *
4106 * @param {number} actual The actual value.
4107 * @param {number} expected The expected value.
4108 * @return {number} The relative error.
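 * @example
 * relativeError(95, 100); // => 0.05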
4109 */
4110function relativeError(actual, expected) {
4111 if (actual === 0 && expected === 0) {
4112 return 0;
4113 }
4114 return Math.abs((actual - expected) / expected);
4115}
4116
4117/**
4118 * Approximate equality.
4119 *
4120 * @param {number} actual The value to be tested.
4121 * @param {number} expected The reference value.
4122 * @param {number} tolerance The acceptable relative difference.
4123 * @return {boolean} Whether numbers are within tolerance.
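 * @example
 * approxEqual(9.99, 10, 0.01); // => true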
4124 */
4125function approxEqual(actual, expected, tolerance) {
4126 if ( tolerance === void 0 ) tolerance = epsilon;
4127
4128 return relativeError(actual, expected) <= tolerance;
4129}
4130
4131export { BayesianClassifier, PerceptronModel, addToMean, approxEqual, mean as average, meanSimple as averageSimple, BayesianClassifier as bayesian, bernoulliDistribution, binomialDistribution, bisect, chiSquaredDistributionTable, chiSquaredGoodnessOfFit, chunk, ckmeans, combinations, combinationsReplacement, combineMeans, combineVariances, cumulativeStdNormalProbability, epsilon, equalIntervalBreaks, errorFunction as erf, errorFunction, extent, extentSorted, factorial, gamma, gammaln, geometricMean, harmonicMean, interquartileRange, inverseErrorFunction, interquartileRange as iqr, kMeansCluster, kernelDensityEstimation as kde, kernelDensityEstimation, linearRegression, linearRegressionLine, medianAbsoluteDeviation as mad, max, maxSorted, mean, meanSimple, median, medianAbsoluteDeviation, medianSorted, min, minSorted, mode, modeFast, modeSorted, numericSort, PerceptronModel as perceptron, permutationTest, permutationsHeap, poissonDistribution, probit, product, quantile, quantileRank, quantileRankSorted, quantileSorted, quickselect, rSquared, relativeError, rootMeanSquare as rms, rootMeanSquare, sample, sampleCorrelation, sampleCovariance, sampleKurtosis, sampleSkewness, sampleStandardDeviation, sampleVariance, sampleWithReplacement, shuffle, shuffleInPlace, sign, silhouette, silhouetteMetric, standardDeviation, standardNormalTable, subtractFromMean, sum, sumNthPowerDeviations, sumSimple, tTest, tTestTwoSample, uniqueCountSorted, variance, zScore };
4132//# sourceMappingURL=simple-statistics.mjs.map