/**
 * [Simple linear regression](http://en.wikipedia.org/wiki/Simple_linear_regression)
 * fits a straight line to a set of coordinates by minimizing the
 * sum of squared residuals, yielding the slope and y-intercept
 * of the least-squares regression line.
 *
 * @param {Array<Array<number>>} data an array of two-element arrays,
 * like `[[0, 1], [2, 3]]`
 * @returns {Object} object containing slope (`m`) and intercept (`b`) of regression line
 * @example
 * linearRegression([[0, 0], [1, 1]]); // => { m: 1, b: 0 }
 */
function linearRegression(data) {
    var slope, intercept;

    // Cache the length locally to avoid repeated property lookups.
    var n = data.length;

    if (n === 1) {
        // A single point cannot determine a slope: arbitrarily use 0
        // and run the line through that point's y value.
        slope = 0;
        intercept = data[0][1];
    } else {
        // Accumulate SS_x, SS_y, SS_xx and SS_xy in a single pass
        // over the data.
        var sx = 0;
        var sy = 0;
        var sxx = 0;
        var sxy = 0;

        for (var i = 0; i < n; i++) {
            var px = data[i][0];
            var py = data[i][1];

            sx += px;
            sy += py;
            sxx += px * px;
            sxy += px * py;
        }

        // Closed-form least-squares solution for the slope…
        slope = (n * sxy - sx * sy) / (n * sxx - sx * sx);

        // …and for the y-intercept.
        intercept = sy / n - (slope * sx) / n;
    }

    // Package both values as an object.
    return {
        m: slope,
        b: intercept
    };
}
|
69 |
|
/**
 * Given the output of `linearRegression`: an object
 * with `m` and `b` values indicating slope and intercept,
 * respectively, generate a line function that translates
 * x values into y values.
 *
 * @param {Object} mb object with `m` and `b` members, representing
 * slope and intersect of desired line
 * @returns {Function} method that computes y-value at any given
 * x-value on the line.
 * @example
 * var l = linearRegressionLine(linearRegression([[0, 0], [1, 1]]));
 * l(0) // = 0
 * l(2) // = 2
 * linearRegressionLine({ b: 0, m: 1 })(1); // => 1
 * linearRegressionLine({ b: 1, m: 1 })(1); // => 2
 */
function linearRegressionLine(mb /*: { b: number, m: number }*/) {
    // The returned closure evaluates y = m * x + b for any x,
    // reading the slope and intercept from the captured `mb`.
    return function (x) {
        return mb.m * x + mb.b;
    };
}
|
95 |
|
/**
 * Our default sum is the [Kahan-Babuska algorithm](https://pdfs.semanticscholar.org/1760/7d467cda1d0277ad272deb2113533131dc09.pdf).
 * This method is an improvement over the classical
 * [Kahan summation algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm).
 * It computes the sum of a list of numbers while compensating for
 * floating-point roundoff: naive repeated addition accumulates
 * precision loss as the list grows, and this algorithm carries a
 * separate correction term to recover most of that lost precision.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x input
 * @return {number} sum of all input numbers
 * @example
 * sum([1, 2, 3]); // => 6
 */
function sum(x) {
    // An empty list sums to zero by convention.
    if (x.length === 0) {
        return 0;
    }

    // Running total, seeded with the first element.
    var total = x[0];

    // Accumulated floating-point error compensation.
    var compensation = 0;

    for (var i = 1; i < x.length; i++) {
        var t = total + x[i];

        // The compensation term is derived differently depending on
        // which operand dominates in magnitude.
        if (Math.abs(total) >= Math.abs(x[i])) {
            compensation += total - t + x[i];
        } else {
            compensation += x[i] - t + total;
        }

        total = t;
    }

    // Fold the compensation back into the total.
    return total + compensation;
}
|
145 |
|
/**
 * The mean, _also known as average_,
 * is the sum of all values over the number of values.
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @throws {Error} if the length of x is less than one
 * @returns {number} mean
 * @example
 * mean([0, 10]); // => 5
 */
function mean(x) {
    // The mean of no numbers is undefined.
    if (x.length === 0) {
        throw new Error("mean requires at least one data point");
    }

    // Error-compensated sum divided by the count.
    return sum(x) / x.length;
}
|
167 |
|
/**
 * The sum of deviations to the Nth power.
 * When n=2 it's the sum of squared deviations.
 * When n=3 it's the sum of cubed deviations.
 *
 * @param {Array<number>} x sample of one or more data points
 * @param {number} n power
 * @returns {number} sum of nth power deviations
 *
 * @example
 * var input = [1, 2, 3];
 * // since the variance of a set is the mean squared
 * // deviations, we can calculate that with sumNthPowerDeviations:
 * sumNthPowerDeviations(input, 2) / input.length;
 */
function sumNthPowerDeviations(x, n) {
    var meanValue = mean(x);
    var total = 0;
    var i;

    if (n === 2) {
        // Fast path: when squaring, multiplying the deviation by
        // itself is significantly faster than calling Math.pow.
        for (i = 0; i < x.length; i++) {
            var deviation = x[i] - meanValue;
            total += deviation * deviation;
        }
    } else {
        // General case for any power n.
        for (i = 0; i < x.length; i++) {
            total += Math.pow(x[i] - meanValue, n);
        }
    }

    return total;
}
|
205 |
|
/**
 * The [variance](http://en.wikipedia.org/wiki/Variance)
 * is the sum of squared deviations from the mean.
 *
 * This is an implementation of population variance, not sample variance:
 * see the `sampleVariance` method if you want a sample measure.
 *
 * @param {Array<number>} x a population of one or more data points
 * @returns {number} variance: a value greater than or equal to zero.
 * zero indicates that all values are identical.
 * @throws {Error} if x's length is 0
 * @example
 * variance([1, 2, 3, 4, 5, 6]); // => 2.9166666666666665
 */
function variance(x) {
    // The variance of no numbers is undefined.
    if (x.length === 0) {
        throw new Error("variance requires at least one data point");
    }

    // Mean of the squared deviations between each value and the mean.
    return sumNthPowerDeviations(x, 2) / x.length;
}
|
229 |
|
/**
 * The [standard deviation](http://en.wikipedia.org/wiki/Standard_deviation)
 * is the square root of the variance. This is also known as the population
 * standard deviation. It's useful for measuring the amount
 * of variation or dispersion in a set of values.
 *
 * Standard deviation is only appropriate for full-population knowledge: for
 * samples of a population, {@link sampleStandardDeviation} is
 * more appropriate.
 *
 * @param {Array<number>} x input
 * @returns {number} standard deviation
 * @example
 * variance([2, 4, 4, 4, 5, 5, 7, 9]); // => 4
 * standardDeviation([2, 4, 4, 4, 5, 5, 7, 9]); // => 2
 */
function standardDeviation(x) {
    // A single observation has no spread.
    if (x.length === 1) {
        return 0;
    }
    return Math.sqrt(variance(x));
}
|
253 |
|
/**
 * The [R Squared](http://en.wikipedia.org/wiki/Coefficient_of_determination)
 * value of data compared with a function `f`
 * is the sum of the squared differences between the prediction
 * and the actual value.
 *
 * @param {Array<Array<number>>} x input data: this should be doubly-nested
 * @param {Function} func function called on `[i][0]` values within the dataset
 * @returns {number} r-squared value
 * @example
 * var samples = [[0, 0], [1, 1]];
 * var regressionLine = linearRegressionLine(linearRegression(samples));
 * rSquared(samples, regressionLine); // = 1 this line is a perfect fit
 */
function rSquared(x, func) {
    // With fewer than two points, any line is a perfect fit.
    if (x.length < 2) {
        return 1;
    }

    // Mean of the observed y values, needed for the
    // total sum of squares below.
    var total = 0;
    for (var i = 0; i < x.length; i++) {
        total += x[i][1];
    }
    var yMean = total / x.length;

    // Total sum of squares: squared difference between each
    // observation and the mean of all observations.
    var totalSumOfSquares = 0;
    for (var j = 0; j < x.length; j++) {
        totalSumOfSquares += Math.pow(yMean - x[j][1], 2);
    }

    // Residual error: squared difference between the prediction
    // and the actual value at each point.
    var residualError = 0;
    for (var k = 0; k < x.length; k++) {
        residualError += Math.pow(x[k][1] - func(x[k][0]), 2);
    }

    // The larger the error relative to the total sum of squares,
    // the lower the r-squared value.
    return 1 - residualError / totalSumOfSquares;
}
|
303 |
|
/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times.
 * There can be multiple modes in a list: in the event of a tie, this
 * algorithm will return the most recently seen mode.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * This runs in `O(n)` because the input is sorted.
 *
 * @param {Array<number>} sorted a sample of one or more data points
 * @returns {number} mode
 * @throws {Error} if sorted is empty
 * @example
 * modeSorted([0, 0, 1]); // => 0
 */
function modeSorted(sorted) {
    // The mode of an empty list is undefined.
    if (sorted.length === 0) {
        throw new Error("mode requires at least one data point");
    } else if (sorted.length === 1) {
        // A one-element list's mode is that element.
        return sorted[0];
    }

    // Value of the run currently being counted.
    var previous = sorted[0];
    // Best mode found so far.
    var best = NaN;
    // Number of occurrences of `best`.
    var bestCount = 0;
    // Number of occurrences of `previous` counted so far.
    var runLength = 1;

    // Iterate one index past the end so the final run is also
    // compared against the best seen: sorted[sorted.length] is
    // undefined, which never equals a number, closing the run.
    for (var i = 1; i <= sorted.length; i++) {
        if (sorted[i] !== previous) {
            // The run just ended: promote it to the mode if it was
            // seen more often than the current best.
            if (runLength > bestCount) {
                bestCount = runLength;
                best = previous;
            }
            runLength = 1;
            previous = sorted[i];
        } else {
            // Same value again: the run continues.
            runLength++;
        }
    }
    return best;
}
|
365 |
|
/**
 * Sort an array of numbers by their numeric value, ensuring that the
 * array is not changed in place.
 *
 * This is necessary because the default behavior of .sort
 * in JavaScript is to sort arrays as string values
 *
 *     [1, 10, 12, 102, 20].sort()
 *     // output
 *     [1, 10, 102, 12, 20]
 *
 * @param {Array<number>} x input array
 * @return {Array<number>} sorted array
 * @private
 * @example
 * numericSort([3, 2, 1]) // => [1, 2, 3]
 */
function numericSort(x) {
    // Copy first so the caller's array is untouched.
    var copy = x.slice();
    // Sort with a numeric comparator: the default comparison
    // would order the values as strings.
    copy.sort(function (left, right) {
        return left - right;
    });
    return copy;
}
|
394 |
|
/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times.
 * There can be multiple modes in a list: in the event of a tie, this
 * algorithm will return the most recently seen mode.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * This runs in `O(n log(n))` because it needs to sort the array internally
 * before running an `O(n)` search to find the mode.
 *
 * @param {Array<number>} x input
 * @returns {number} mode
 * @example
 * mode([0, 0, 1]); // => 0
 */
function mode(x) {
    // Sort a copy first so that the linear scan in modeSorted can
    // rely on equal values being adjacent.
    return modeSorted(numericSort(x));
}
|
418 |
|
419 | /* globals Map: false */
|
420 |
|
/**
 * The [mode](https://en.wikipedia.org/wiki/Mode_%28statistics%29) is the number
 * that appears in a list the highest number of times.
 * There can be multiple modes in a list: in the event of a tie, this
 * algorithm will return the most recently seen mode.
 *
 * modeFast uses a Map object to keep track of the mode, instead of the approach
 * used with `mode`, a sorted array. As a result, it is faster
 * than `mode` and supports any data type that can be compared with `==`.
 * It also requires a
 * [JavaScript environment with support for Map](https://kangax.github.io/compat-table/es6/#test-Map),
 * and will throw an error if Map is not available.
 *
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * @param {Array<*>} x a sample of one or more data points
 * @returns {?*} mode
 * @throws {ReferenceError} if the JavaScript environment doesn't support Map
 * @throws {Error} if x is empty
 * @example
 * modeFast(['rabbits', 'rabbits', 'squirrels']); // => 'rabbits'
 */
function modeFast(x) {
    // This index will reflect the incidence of different values, indexing
    // them like
    // { value: count }
    var index = new Map();

    // A running `mode` and the number of times it has been encountered.
    var mode;
    var modeCount = 0;

    for (var i = 0; i < x.length; i++) {
        var newCount = index.get(x[i]);
        if (newCount === undefined) {
            newCount = 1;
        } else {
            newCount++;
        }
        // Strict `>` means an earlier value keeps winning a tie until
        // a later value strictly exceeds its count.
        if (newCount > modeCount) {
            mode = x[i];
            modeCount = newCount;
        }
        index.set(x[i], newCount);
    }

    // Fix: error message previously read "at last one data point",
    // inconsistent with every other error string in this module.
    if (modeCount === 0) {
        throw new Error("mode requires at least one data point");
    }

    return mode;
}
|
474 |
|
/**
 * The min is the lowest number in the array.
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @throws {Error} if the length of x is less than one
 * @returns {number} minimum value
 * @example
 * min([1, 5, -10, 100, 2]); // => -10
 */
function min(x) {
    // The minimum of no numbers is undefined.
    if (x.length === 0) {
        throw new Error("min requires at least one data point");
    }

    // Linear scan, keeping the smallest element seen so far.
    var smallest = x[0];
    for (var i = 1; i < x.length; i++) {
        if (x[i] < smallest) {
            smallest = x[i];
        }
    }
    return smallest;
}
|
498 |
|
/**
 * This computes the maximum number in an array.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @returns {number} maximum value
 * @throws {Error} if the length of x is less than one
 * @example
 * max([1, 2, 3, 4]);
 * // => 4
 */
function max(x) {
    // The maximum of no numbers is undefined.
    if (x.length === 0) {
        throw new Error("max requires at least one data point");
    }

    // Linear scan, keeping the largest element seen so far.
    var largest = x[0];
    for (var i = 1; i < x.length; i++) {
        if (x[i] > largest) {
            largest = x[i];
        }
    }
    return largest;
}
|
524 |
|
/**
 * This computes the minimum & maximum number in an array.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x sample of one or more data points
 * @returns {Array<number>} minimum & maximum value
 * @throws {Error} if the length of x is less than one
 * @example
 * extent([1, 2, 3, 4]);
 * // => [1, 4]
 */
function extent(x) {
    // The extent of no numbers is undefined.
    if (x.length === 0) {
        throw new Error("extent requires at least one data point");
    }

    // Track both bounds in a single linear scan.
    var lo = x[0];
    var hi = x[0];
    for (var i = 1; i < x.length; i++) {
        if (x[i] > hi) {
            hi = x[i];
        }
        if (x[i] < lo) {
            lo = x[i];
        }
    }
    return [lo, hi];
}
|
554 |
|
/**
 * The minimum is the lowest number in the array. With a sorted array,
 * the first element in the array is always the smallest, so this calculation
 * can be done in one step, or constant time.
 *
 * @param {Array<number>} x input, sorted ascending
 * @returns {number} minimum value
 * @example
 * minSorted([-100, -10, 1, 2, 5]); // => -100
 */
function minSorted(x) {
    // Sorted ascending: the first element is the smallest.
    return x[0];
}
|
568 |
|
/**
 * The maximum is the highest number in the array. With a sorted array,
 * the last element in the array is always the largest, so this calculation
 * can be done in one step, or constant time.
 *
 * @param {Array<number>} x input, sorted ascending
 * @returns {number} maximum value
 * @example
 * maxSorted([-100, -10, 1, 2, 5]); // => 5
 */
function maxSorted(x) {
    // Sorted ascending: the last element is the largest.
    return x[x.length - 1];
}
|
582 |
|
/**
 * The extent is the lowest & highest number in the array. With a sorted array,
 * the first element is always the lowest and the last element is always the
 * largest, so this calculation can be done in one step, or constant time.
 *
 * @param {Array<number>} x input, sorted ascending
 * @returns {Array<number>} minimum & maximum value
 * @example
 * extentSorted([-100, -10, 1, 2, 5]); // => [-100, 5]
 */
function extentSorted(x) {
    // First and last elements of a sorted array are its bounds.
    return [x[0], x[x.length - 1]];
}
|
596 |
|
/**
 * The simple [sum](https://en.wikipedia.org/wiki/Summation) of an array
 * is the result of adding all numbers together, starting from zero.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x input
 * @return {number} sum of all input numbers
 * @example
 * sumSimple([1, 2, 3]); // => 6
 */
function sumSimple(x) {
    // Plain accumulation without floating-point error compensation;
    // see `sum` for the compensated variant.
    var total = 0;
    for (var i = 0; i < x.length; i++) {
        total += x[i];
    }
    return total;
}
|
615 |
|
/**
 * The [product](https://en.wikipedia.org/wiki/Product_(mathematics)) of an array
 * is the result of multiplying all numbers together, starting using one as the multiplicative identity.
 *
 * This runs in `O(n)`, linear time, with respect to the length of the array.
 *
 * @param {Array<number>} x input
 * @return {number} product of all input numbers
 * @example
 * product([1, 2, 3, 4]); // => 24
 */
function product(x) {
    // Start from the multiplicative identity so an empty
    // array yields 1.
    var result = 1;
    for (var i = 0; i < x.length; i++) {
        result *= x[i];
    }
    return result;
}
|
634 |
|
/**
 * This is the internal implementation of quantiles: when you know
 * that the order is sorted, you don't need to re-sort it, and the computations
 * are faster.
 *
 * @param {Array<number>} x sample of one or more data points, sorted ascending
 * @param {number} p desired quantile: a number between 0 to 1, inclusive
 * @returns {number} quantile value
 * @throws {Error} if p is outside of the range from 0 to 1
 * @throws {Error} if x is empty
 * @example
 * quantileSorted([3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20], 0.5); // => 9
 */
function quantileSorted(x, p) {
    var idx = x.length * p;

    // The quantile of an empty list is undefined.
    if (x.length === 0) {
        throw new Error("quantile requires at least one data point.");
    }
    if (p < 0 || p > 1) {
        throw new Error("quantiles must be between 0 and 1");
    }
    if (p === 1) {
        // p of 1 is simply the last element.
        return x[x.length - 1];
    }
    if (p === 0) {
        // p of 0 is simply the first element.
        return x[0];
    }
    if (idx % 1 !== 0) {
        // Fractional position: take the next element up.
        return x[Math.ceil(idx) - 1];
    }
    if (x.length % 2 === 0) {
        // Integer position in an even-length list: average this
        // element with its successor.
        return (x[idx - 1] + x[idx]) / 2;
    }
    // Integer position in an odd-length list: direct lookup.
    return x[idx];
}
|
673 |
|
/**
 * Rearrange items in `arr` so that all items in `[left, k]` range are the smallest.
 * The `k`-th element will have the `(k - left + 1)`-th smallest value in `[left, right]`.
 *
 * Implements Floyd-Rivest selection algorithm https://en.wikipedia.org/wiki/Floyd-Rivest_algorithm
 *
 * NOTE(review): `left`/`right` default via `||`, so an explicit `right` of 0
 * is indistinguishable from "unset" and falls back to `arr.length - 1`.
 *
 * @param {Array<number>} arr input array
 * @param {number} k pivot index
 * @param {number} [left] left index
 * @param {number} [right] right index
 * @returns {void} mutates input array
 * @example
 * var arr = [65, 28, 59, 33, 21, 56, 22, 95, 50, 12, 90, 53, 28, 77, 39];
 * quickselect(arr, 8);
 * // = [39, 28, 28, 33, 21, 12, 22, 50, 53, 56, 59, 65, 90, 77, 95]
 */
function quickselect(arr, k, left, right) {
    left = left || 0;
    right = right || arr.length - 1;

    while (right > left) {
        // 600 and 0.5 are arbitrary constants chosen in the original paper to minimize execution time
        if (right - left > 600) {
            // For large ranges, first recurse on a statistically-chosen
            // subrange around k so the partition below does less work.
            var n = right - left + 1;
            var m = k - left + 1;
            var z = Math.log(n);
            var s = 0.5 * Math.exp((2 * z) / 3);
            var sd = 0.5 * Math.sqrt((z * s * (n - s)) / n);
            // Bias the sampled window toward the side k falls on.
            if (m - n / 2 < 0) { sd *= -1; }
            var newLeft = Math.max(left, Math.floor(k - (m * s) / n + sd));
            var newRight = Math.min(
                right,
                Math.floor(k + ((n - m) * s) / n + sd)
            );
            quickselect(arr, k, newLeft, newRight);
        }

        // Partition [left, right] around the value currently at index k.
        var t = arr[k];
        var i = left;
        var j = right;

        // Move the pivot value to the left boundary, and ensure the
        // rightmost element is not greater than the pivot.
        swap(arr, left, k);
        if (arr[right] > t) { swap(arr, left, right); }

        // Hoare-style partition: walk i and j toward each other,
        // exchanging out-of-place elements.
        while (i < j) {
            swap(arr, i, j);
            i++;
            j--;
            while (arr[i] < t) { i++; }
            while (arr[j] > t) { j--; }
        }

        // Put the pivot value into its final position j.
        if (arr[left] === t) { swap(arr, left, j); }
        else {
            j++;
            swap(arr, j, right);
        }

        // Shrink the working window to the side that still contains k.
        if (j <= k) { left = j + 1; }
        if (k <= j) { right = j - 1; }
    }
}
|
736 |
|
/**
 * Exchange the elements at indices `i` and `j` of `arr` in place.
 *
 * @param {Array} arr array to mutate
 * @param {number} i first index
 * @param {number} j second index
 * @returns {void} mutates input array
 */
function swap(arr, i, j) {
    var held = arr[i];
    arr[i] = arr[j];
    arr[j] = held;
}
|
742 |
|
/**
 * The [quantile](https://en.wikipedia.org/wiki/Quantile):
 * this is a population quantile, since we assume to know the entire
 * dataset in this library. This is an implementation of the
 * [Quantiles of a Population](http://en.wikipedia.org/wiki/Quantile#Quantiles_of_a_population)
 * algorithm from wikipedia.
 *
 * Sample is a one-dimensional array of numbers,
 * and p is either a decimal number from 0 to 1 or an array of decimal
 * numbers from 0 to 1.
 * In terms of a k/q quantile, p = k/q - it's just dealing with fractions or dealing
 * with decimal values.
 * When p is an array, the result of the function is also an array containing the appropriate
 * quantiles in input order
 *
 * @param {Array<number>} x sample of one or more numbers
 * @param {Array<number> | number} p the desired quantile, as a number between 0 and 1
 * @returns {number} quantile
 * @example
 * quantile([3, 6, 7, 8, 8, 9, 10, 13, 15, 16, 20], 0.5); // => 9
 */
function quantile(x, p) {
    // Work on a copy: selection rearranges elements in place.
    var copy = x.slice();

    if (!Array.isArray(p)) {
        // Single quantile: place just the element(s) the quantile
        // needs into their sorted positions, then read the value.
        var idx = quantileIndex(copy.length, p);
        quantileSelect(copy, idx, 0, copy.length - 1);
        return quantileSorted(copy, p);
    }

    // Multiple quantiles: partially sort so every element a requested
    // quantile depends on sits where full sorting would place it.
    multiQuantileSelect(copy, p);
    var results = [];
    for (var i = 0; i < p.length; i++) {
        results[i] = quantileSorted(copy, p[i]);
    }
    return results;
}
|
784 |
|
/**
 * Place the element(s) needed by a quantile at index `k` into their
 * sorted positions within `arr[left..right]`.
 *
 * A fractional `k` means the quantile is the average of two adjacent
 * order statistics, so both neighbours must be selected.
 *
 * @param {Array<number>} arr array to partially sort in place
 * @param {number} k target index, possibly halfway between two integers
 * @param {number} left left bound of the working range
 * @param {number} right right bound of the working range
 * @returns {void} mutates input array
 */
function quantileSelect(arr, k, left, right) {
    if (k % 1 === 0) {
        quickselect(arr, k, left, right);
        return;
    }
    k = Math.floor(k);
    quickselect(arr, k, left, right);
    quickselect(arr, k + 1, k + 1, right);
}
|
794 |
|
/**
 * Partially sort `arr` so that every element any of the quantiles in
 * `p` depends on ends up where a full sort would have placed it.
 *
 * @param {Array<number>} arr array to partially sort in place
 * @param {Array<number>} p requested quantiles, each between 0 and 1
 * @returns {void} mutates input array
 */
function multiQuantileSelect(arr, p) {
    // Collect the array index of every requested quantile, bracketed
    // by the first and last indices of the array, in ascending order.
    var indices = [0];
    for (var i = 0; i < p.length; i++) {
        indices.push(quantileIndex(arr.length, p[i]));
    }
    indices.push(arr.length - 1);
    indices.sort(compare);

    // Divide and conquer over the index list with an explicit stack
    // of [lo, hi] pairs instead of recursion.
    var stack = [0, indices.length - 1];

    while (stack.length) {
        var hi = Math.ceil(stack.pop());
        var lo = Math.floor(stack.pop());
        if (hi - lo <= 1) {
            continue;
        }

        // Select the middle quantile index first, then handle each half.
        var mid = Math.floor((lo + hi) / 2);
        quantileSelect(
            arr,
            indices[mid],
            Math.floor(indices[lo]),
            Math.ceil(indices[hi])
        );

        stack.push(lo, mid, mid, hi);
    }
}
|
821 |
|
/**
 * Ascending numeric comparator for Array#sort.
 *
 * @param {number} a first value
 * @param {number} b second value
 * @returns {number} negative, zero, or positive difference
 */
function compare(a, b) {
    return a - b;
}
|
825 |
|
/**
 * Compute the array index corresponding to quantile `p` of a list of
 * length `len`. A half-way value (e.g. `1.5`) signals that the
 * quantile is the average of the two neighbouring elements.
 *
 * @param {number} len length of the list
 * @param {number} p desired quantile, between 0 and 1
 * @returns {number} index, possibly halfway between two integers
 */
function quantileIndex(len, p) {
    var pos = len * p;
    if (p === 1) {
        // The last index holds the maximum.
        return len - 1;
    }
    if (p === 0) {
        // The first index holds the minimum.
        return 0;
    }
    if (pos % 1 !== 0) {
        // Fractional position: round up to the covering element.
        return Math.ceil(pos) - 1;
    }
    if (len % 2 === 0) {
        // Even length: return the midpoint of the two indices around
        // the quantile to indicate their values must be averaged.
        return pos - 0.5;
    }
    // Integer position in an odd-length list: use it directly.
    return pos;
}
|
847 |
|
848 | /* eslint no-bitwise: 0 */
|
849 |
|
/**
 * This function returns the quantile in which one would find the given value in
 * the given array. With a sorted array, leveraging binary search, we can find
 * this information in logarithmic time.
 *
 * @param {Array<number>} x input, sorted ascending
 * @param {number} value value whose quantile rank is computed
 * @returns {number} value value
 * @example
 * quantileRankSorted([1, 2, 3, 4], 3); // => 0.75
 * quantileRankSorted([1, 2, 3, 3, 4], 3); // => 0.7
 * quantileRankSorted([1, 2, 3, 4], 6); // => 1
 * quantileRankSorted([1, 2, 3, 3, 5], 4); // => 0.8
 */
function quantileRankSorted(x, value) {
    // Below every element in the array.
    if (value < x[0]) {
        return 0;
    }

    // Above every element in the array.
    if (value > x[x.length - 1]) {
        return 1;
    }

    var l = lowerBound(x, value);

    // Absent from the array: rank is the fraction of elements below it.
    if (x[l] !== value) {
        return l / x.length;
    }

    l++;

    var u = upperBound(x, value);

    // Present exactly once.
    if (u === l) {
        return l / x.length;
    }

    // The value occurs several times: its rank is the mean of the
    // index range covering it, computed with the arithmetic-series
    // formula instead of an explicit loop.
    var count = u - l + 1;
    var rankSum = (count * (u + l)) / 2;
    var meanRank = rankSum / count;

    return meanRank / x.length;
}
|
900 |
|
/**
 * Binary search for the first index in sorted array `x` whose element
 * is greater than or equal to `value`.
 *
 * @param {Array<number>} x input, sorted ascending
 * @param {number} value search target
 * @returns {number} index of the first element >= value, or x.length
 */
function lowerBound(x, value) {
    var lo = 0;
    var hi = x.length;

    while (lo < hi) {
        // Unsigned shift halves without overflow for valid lengths.
        var mid = (lo + hi) >>> 1;
        if (value <= x[mid]) {
            hi = mid;
        } else {
            lo = mid + 1;
        }
    }

    return lo;
}
|
918 |
|
/**
 * Binary search for the first index whose element is > `value`,
 * assuming `x` is sorted ascending. Returns `x.length` when no
 * element is greater than `value`.
 *
 * @private
 * @param {Array<number>} x sorted input
 * @param {number} value search target
 * @returns {number} index of the first element greater than `value`
 */
function upperBound(x, value) {
    var low = 0;
    var high = x.length;

    while (low < high) {
        // Unsigned shift halves the range without Math.floor.
        var midpoint = (low + high) >>> 1;

        if (value >= x[midpoint]) {
            // midpoint is still <= value; move past it.
            low = midpoint + 1;
        } else {
            // midpoint is a candidate; keep looking to the left.
            high = midpoint;
        }
    }

    return low;
}
|
936 |
|
/**
 * This function returns the quantile in which one would find the given value in
 * the given array. It will copy and sort your array before each run, so
 * if you know your array is already sorted, you should use `quantileRankSorted`
 * instead.
 *
 * @param {Array<number>} x input
 * @param {number} value the value whose quantile rank is sought
 * @returns {number} quantile rank, between 0 and 1 inclusive
 * @example
 * quantileRank([4, 3, 1, 2], 3); // => 0.75
 * quantileRank([4, 3, 2, 3, 1], 3); // => 0.7
 * quantileRank([2, 4, 1, 3], 6); // => 1
 * quantileRank([5, 3, 1, 2, 3], 4); // => 0.8
 */
function quantileRank(x, value) {
    // Work on a sorted copy so the caller's array is untouched, then
    // delegate to the sorted-input implementation.
    return quantileRankSorted(numericSort(x), value);
}
|
957 |
|
/**
 * The [Interquartile range](http://en.wikipedia.org/wiki/Interquartile_range) is
 * a measure of statistical dispersion, or how scattered, spread, or
 * concentrated a distribution is. It's computed as the difference between
 * the third quartile and first quartile.
 *
 * @param {Array<number>} x sample of one or more numbers
 * @returns {number} interquartile range: the span between lower and upper quartile,
 * 0.25 and 0.75
 * @example
 * interquartileRange([0, 1, 2, 3]); // => 2
 */
function interquartileRange(x) {
    // The IQR is the gap between the upper quartile (0.75)
    // and the lower quartile (0.25).
    var upperQuartile = quantile(x, 0.75);
    var lowerQuartile = quantile(x, 0.25);

    if (
        typeof upperQuartile === "number" &&
        typeof lowerQuartile === "number"
    ) {
        return upperQuartile - lowerQuartile;
    }
    // Implicitly returns undefined when `quantile` yields non-numbers,
    // matching the original behavior.
}
|
980 |
|
/**
 * The [median](http://en.wikipedia.org/wiki/Median) is
 * the middle number of a list. This is often a good indicator of 'the middle'
 * when there are outliers that skew the `mean()` value.
 * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
 * a method of finding a typical or central value of a set of numbers.
 *
 * The median isn't necessarily one of the elements in the list: the value
 * can be the average of two elements if the list has an even length
 * and the two central values are different.
 *
 * @param {Array<number>} x input
 * @returns {number} median value
 * @example
 * median([10, 2, 5, 100, 2, 1]); // => 3.5
 */
function median(x) {
    // The 0.5 quantile is the median; the unary plus coerces the
    // result to a number, preserving the original's exact behavior.
    return +quantile(x, 0.5);
}
|
1000 |
|
/**
 * The [Median Absolute Deviation](http://en.wikipedia.org/wiki/Median_absolute_deviation) is
 * a robust measure of statistical
 * dispersion. It is more resilient to outliers than the standard deviation.
 *
 * @param {Array<number>} x input array
 * @returns {number} median absolute deviation
 * @example
 * medianAbsoluteDeviation([1, 1, 2, 2, 4, 6, 9]); // => 1
 */
function medianAbsoluteDeviation(x) {
    var centre = median(x);

    // Absolute deviation of each value from the sample median
    var deviations = x.map(function (value) {
        return Math.abs(value - centre);
    });

    // The MAD is the median of those deviations
    return median(deviations);
}
|
1023 |
|
/**
 * Split an array into chunks of a specified size. This function
 * has the same behavior as [PHP's array_chunk](http://php.net/manual/en/function.array-chunk.php)
 * function, and thus will insert smaller-sized chunks at the end if
 * the input size is not divisible by the chunk size.
 *
 * `x` is expected to be an array, and `chunkSize` a number.
 * The `x` array can contain any kind of data.
 *
 * @param {Array} x a sample
 * @param {number} chunkSize size of each output array. must be a positive integer
 * @returns {Array<Array>} a chunked array
 * @throws {Error} if chunk size is less than 1 or not an integer
 * @example
 * chunk([1, 2, 3, 4, 5, 6], 2);
 * // => [[1, 2], [3, 4], [5, 6]]
 */
function chunk(x, chunkSize) {
    // A chunk size below one would keep the slicing loop below from ever
    // advancing, looping forever — reject it up front.
    if (chunkSize < 1) {
        throw new Error("chunk size must be a positive number");
    }

    // Fractional sizes would produce overlapping/odd slices; require an integer.
    if (Math.floor(chunkSize) !== chunkSize) {
        throw new Error("chunk size must be an integer");
    }

    var output = [];

    // Walk the input in strides of `chunkSize`. `.slice` clamps at the end
    // of the array (so the final chunk may be shorter) and never mutates
    // the input.
    for (var start = 0; start < x.length; start += chunkSize) {
        output.push(x.slice(start, start + chunkSize));
    }

    return output;
}
|
1067 |
|
/**
 * Sampling with replacement is a type of sampling that allows the same
 * item to be picked out of a population more than once.
 *
 * @param {Array<*>} x an array of any kind of value
 * @param {number} n count of how many elements to take
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @return {Array} n sampled items from the population
 * @example
 * var values = [1, 2, 3, 4];
 * sampleWithReplacement(values, 2); // returns 2 random values, like [2, 4];
 */
function sampleWithReplacement(x, n, randomSource) {
    // Nothing to draw from an empty population.
    if (x.length === 0) {
        return [];
    }

    // Allow injection of a seeded or alternative RNG, e.g.
    // [random-js](https://www.npmjs.org/package/random-js)
    randomSource = randomSource || Math.random;

    var size = x.length;
    var result = [];

    // Each draw is independent, so the same element may be picked
    // more than once.
    for (var draw = 0; draw < n; draw++) {
        result.push(x[Math.floor(randomSource() * size)]);
    }

    return result;
}
|
1102 |
|
/**
 * A [Fisher-Yates shuffle](http://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle)
 * in-place - which means that it **will change the order of the original
 * array by reference**.
 *
 * This is an algorithm that generates a random [permutation](https://en.wikipedia.org/wiki/Permutation)
 * of a set.
 *
 * @param {Array} x sample of one or more numbers
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @returns {Array} x
 * @example
 * var x = [1, 2, 3, 4];
 * shuffleInPlace(x);
 * // x is shuffled to a value like [2, 1, 4, 3]
 */
function shuffleInPlace(x, randomSource) {
    // Allow injection of a seeded or alternative RNG, e.g.
    // [random-js](https://www.npmjs.org/package/random-js)
    randomSource = randomSource || Math.random;

    // `remaining` counts the elements not yet placed: everything at
    // index >= remaining has already been shuffled.
    for (var remaining = x.length; remaining > 0; ) {
        // Pick a random index among the not-yet-placed elements...
        var pick = Math.floor(randomSource() * remaining);
        remaining -= 1;

        // ...and swap it into the last unplaced slot.
        var held = x[remaining];
        x[remaining] = x[pick];
        x[pick] = held;
    }

    return x;
}
|
1153 |
|
/**
 * A [Fisher-Yates shuffle](http://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle)
 * is a fast way to create a random permutation of a finite set. This is
 * a function around `shuffle_in_place` that adds the guarantee that
 * it will not modify its input.
 *
 * @param {Array} x sample of 0 or more numbers
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @return {Array} shuffled version of input
 * @example
 * var shuffled = shuffle([1, 2, 3, 4]);
 * shuffled; // = [2, 3, 1, 4] or any other random permutation
 */
function shuffle(x, randomSource) {
    // Shallow-copy so the caller's array is left untouched, then
    // delegate the actual Fisher-Yates work to shuffleInPlace.
    return shuffleInPlace(x.slice(), randomSource);
}
|
1175 |
|
/**
 * Create a [simple random sample](http://en.wikipedia.org/wiki/Simple_random_sample)
 * from a given array of `n` elements.
 *
 * The sampled values will be in any order, not necessarily the order
 * they appear in the input.
 *
 * @param {Array<any>} x input array. can contain any type
 * @param {number} n count of how many elements to take
 * @param {Function} [randomSource=Math.random] an optional entropy source that
 * returns numbers between 0 inclusive and 1 exclusive: the range [0, 1)
 * @return {Array} subset of n elements in original array
 *
 * @example
 * var values = [1, 2, 4, 5, 6, 7, 8, 9];
 * sample(values, 3); // returns 3 random values, like [2, 5, 8];
 */
function sample(x, n, randomSource) {
    // Fisher-Yates shuffle a copy of the input, then keep the first
    // `n` items of the shuffled result.
    return shuffle(x, randomSource).slice(0, n);
}
|
1200 |
|
/**
 * Create a new column x row matrix, filled with zeros.
 *
 * @private
 * @param {number} columns
 * @param {number} rows
 * @return {Array<Array<number>>} matrix
 * @example
 * makeMatrix(10, 10);
 */
function makeMatrix(columns, rows) {
    var matrix = [];
    var col = 0;
    while (col < columns) {
        // each entry of the outer array is a zero-filled
        // array of `rows` cells
        var cells = [];
        var row = 0;
        while (row < rows) {
            cells.push(0);
            row++;
        }
        matrix.push(cells);
        col++;
    }
    return matrix;
}
|
1222 |
|
/**
 * For a sorted input, counting the number of unique values
 * is possible in constant time and constant memory. This is
 * a simple implementation of the algorithm.
 *
 * Values are compared with `===`, so objects and non-primitive objects
 * are not handled in any special way.
 *
 * @param {Array<*>} x an array of any kind of value
 * @returns {number} count of unique values
 * @example
 * uniqueCountSorted([1, 2, 3]); // => 3
 * uniqueCountSorted([1, 1, 1]); // => 1
 */
function uniqueCountSorted(x) {
    var count = 0;
    var previous;

    for (var i = 0; i < x.length; i++) {
        // The first element always starts a new run; afterwards, a new
        // run begins whenever the value differs from its predecessor.
        if (i === 0 || x[i] !== previous) {
            previous = x[i];
            count++;
        }
    }

    return count;
}
|
1248 |
|
/**
 * Generates incrementally computed values based on the sums and sums of
 * squares for the data array: the within-segment sum of squared
 * deviations s(j, i) for the slice [j, i].
 *
 * @private
 * @param {number} j segment start index
 * @param {number} i segment end index
 * @param {Array<number>} sums prefix sums of the data
 * @param {Array<number>} sumsOfSquares prefix sums of squared data
 * @return {number} sum of squared deviations for the segment, clamped at 0
 * @example
 * ssq(0, 1, [-1, 0, 2], [1, 1, 5]);
 */
function ssq(j, i, sums, sumsOfSquares) {
    var sji; // s(j, i)

    if (j > 0) {
        // mu(j, i): the segment mean, recovered from the prefix sums.
        var muji = (sums[i] - sums[j - 1]) / (i - j + 1);
        sji =
            sumsOfSquares[i] - sumsOfSquares[j - 1] - (i - j + 1) * muji * muji;
    } else {
        // Segment starts at 0, so no left prefix to subtract.
        sji = sumsOfSquares[i] - (sums[i] * sums[i]) / (i + 1);
    }

    // Floating-point cancellation can push the result slightly below
    // zero; clamp, since a sum of squares cannot be negative.
    return sji < 0 ? 0 : sji;
}
|
1276 |
|
/**
 * Function that recursively divides and conquers computations
 * for cluster j.
 *
 * The recursion computes the optimal cost `matrix[cluster][i]` (and the
 * backtracking split point `backtrackMatrix[cluster][i]`) for the midpoint
 * `i` of [iMin, iMax], then recurses on the two halves. Monotonicity of the
 * optimal split point lets each half restrict its candidate range, which is
 * what brings the overall runtime down to O(kn log n).
 *
 * @private
 * @param {number} iMin Minimum index in cluster to be computed
 * @param {number} iMax Maximum index in cluster to be computed
 * @param {number} cluster Index of the cluster currently being computed
 * @param {Array<Array<number>>} matrix cost matrix (named 'S' in the paper)
 * @param {Array<Array<number>>} backtrackMatrix split-point matrix (named 'J')
 * @param {Array<number>} sums prefix sums of the (shifted) data
 * @param {Array<number>} sumsOfSquares prefix sums of squared (shifted) data
 */
function fillMatrixColumn(
    iMin,
    iMax,
    cluster,
    matrix,
    backtrackMatrix,
    sums,
    sumsOfSquares
) {
    // Base case: empty index range, nothing to compute.
    if (iMin > iMax) {
        return;
    }

    // Start at midpoint between iMin and iMax
    var i = Math.floor((iMin + iMax) / 2);

    // Pessimistic initial guess: the last cluster holds only element i.
    matrix[cluster][i] = matrix[cluster - 1][i - 1];
    backtrackMatrix[cluster][i] = i;

    var jlow = cluster; // the lower end for j

    // Tighten the candidate range using already-computed neighbors:
    // optimal split points are monotone in i, so the split for iMin - 1
    // bounds ours from below...
    if (iMin > cluster) {
        jlow = Math.max(jlow, backtrackMatrix[cluster][iMin - 1] || 0);
    }
    // ...as does the split chosen with one fewer cluster.
    jlow = Math.max(jlow, backtrackMatrix[cluster - 1][i] || 0);

    var jhigh = i - 1; // the upper end for j
    // Symmetrically, the split for iMax + 1 bounds ours from above.
    if (iMax < matrix[0].length - 1) {
        jhigh = Math.min(jhigh, backtrackMatrix[cluster][iMax + 1] || 0);
    }

    var sji;
    var sjlowi;
    var ssqjlow;
    var ssqj;
    // Scan candidate split points from high to low; the loop alternates
    // between probing j itself and the current lower bound jlow.
    for (var j = jhigh; j >= jlow; --j) {
        sji = ssq(j, i, sums, sumsOfSquares);

        // If even the cheapest completion through jlow can't beat the
        // current best, no smaller j can either — stop early.
        if (sji + matrix[cluster - 1][jlow - 1] >= matrix[cluster][i]) {
            break;
        }

        // Examine the lower bound of the cluster border
        sjlowi = ssq(jlow, i, sums, sumsOfSquares);

        ssqjlow = sjlowi + matrix[cluster - 1][jlow - 1];

        if (ssqjlow < matrix[cluster][i]) {
            // Shrink the lower bound
            matrix[cluster][i] = ssqjlow;
            backtrackMatrix[cluster][i] = jlow;
        }
        jlow++;

        // Also test j itself as the split point.
        ssqj = sji + matrix[cluster - 1][j - 1];
        if (ssqj < matrix[cluster][i]) {
            matrix[cluster][i] = ssqj;
            backtrackMatrix[cluster][i] = j;
        }
    }

    // Recurse on both halves; each benefits from the bounds established
    // by the midpoint just computed.
    fillMatrixColumn(
        iMin,
        i - 1,
        cluster,
        matrix,
        backtrackMatrix,
        sums,
        sumsOfSquares
    );
    fillMatrixColumn(
        i + 1,
        iMax,
        cluster,
        matrix,
        backtrackMatrix,
        sums,
        sumsOfSquares
    );
}
|
1370 |
|
/**
 * Initializes the main matrices used in Ckmeans and kicks
 * off the divide and conquer cluster computation strategy.
 *
 * @private
 * @param {Array<number>} data sorted array of values
 * @param {Array<Array<number>>} matrix cost matrix, mutated in place
 * @param {Array<Array<number>>} backtrackMatrix split-point matrix, mutated in place
 */
function fillMatrices(data, matrix, backtrackMatrix) {
    var nValues = matrix[0].length;

    // Shift values by the median to improve numeric stability
    // (smaller magnitudes reduce cancellation in the prefix sums).
    var shift = data[Math.floor(nValues / 2)];

    // Cumulative sum and cumulative sum of squares for all values in data array
    var sums = [];
    var sumsOfSquares = [];

    // Initialize first column in matrix & backtrackMatrix
    for (var i = 0, shiftedValue = (void 0); i < nValues; ++i) {
        shiftedValue = data[i] - shift;
        if (i === 0) {
            // First element seeds the running sums.
            sums.push(shiftedValue);
            sumsOfSquares.push(shiftedValue * shiftedValue);
        } else {
            sums.push(sums[i - 1] + shiftedValue);
            sumsOfSquares.push(
                sumsOfSquares[i - 1] + shiftedValue * shiftedValue
            );
        }

        // Initialize for cluster = 0: one cluster covering [0, i], whose
        // cost is just the within-segment sum of squares.
        matrix[0][i] = ssq(0, i, sums, sumsOfSquares);
        backtrackMatrix[0][i] = 0;
    }

    // Initialize the rest of the columns
    var iMin;
    for (var cluster = 1; cluster < matrix.length; ++cluster) {
        if (cluster < matrix.length - 1) {
            // At least `cluster` elements are needed to fill `cluster + 1`
            // groups, so earlier cells are never read.
            iMin = cluster;
        } else {
            // No need to compute matrix[K-1][0] ... matrix[K-1][N-2]
            iMin = nValues - 1;
        }

        fillMatrixColumn(
            iMin,
            nValues - 1,
            cluster,
            matrix,
            backtrackMatrix,
            sums,
            sumsOfSquares
        );
    }
}
|
1429 |
|
/**
 * Ckmeans clustering is an improvement on heuristic-based clustering
 * approaches like Jenks. The algorithm was developed in
 * [Haizhou Wang and Mingzhou Song](http://journal.r-project.org/archive/2011-2/RJournal_2011-2_Wang+Song.pdf)
 * as a [dynamic programming](https://en.wikipedia.org/wiki/Dynamic_programming) approach
 * to the problem of clustering numeric data into groups with the least
 * within-group sum-of-squared-deviations.
 *
 * Minimizing the difference within groups - what Wang & Song refer to as
 * `withinss`, or within sum-of-squares, means that groups are optimally
 * homogenous within and the data is split into representative groups.
 * This is very useful for visualization, where you may want to represent
 * a continuous variable in discrete color or style groups. This function
 * can provide groups that emphasize differences between data.
 *
 * Being a dynamic approach, this algorithm is based on two matrices that
 * store incrementally-computed values for squared deviations and backtracking
 * indexes.
 *
 * This implementation is based on Ckmeans 3.4.6, which introduced a new divide
 * and conquer approach that improved runtime from O(kn^2) to O(kn log(n)).
 *
 * Unlike the [original implementation](https://cran.r-project.org/web/packages/Ckmeans.1d.dp/index.html),
 * this implementation does not include any code to automatically determine
 * the optimal number of clusters: this information needs to be explicitly
 * provided.
 *
 * ### References
 * _Ckmeans.1d.dp: Optimal k-means Clustering in One Dimension by Dynamic
 * Programming_ Haizhou Wang and Mingzhou Song ISSN 2073-4859
 *
 * from The R Journal Vol. 3/2, December 2011
 * @param {Array<number>} x input data, as an array of number values
 * @param {number} nClusters number of desired classes. This cannot be
 * greater than the number of values in the data array.
 * @returns {Array<Array<number>>} clustered input
 * @throws {Error} if the number of requested clusters is higher than the size of the data
 * @example
 * ckmeans([-1, 2, -1, 2, 4, 5, 6, -1, 2, -1], 3);
 * // The input, clustered into groups of similar numbers.
 * //= [[-1, -1, -1, -1], [2, 2, 2], [4, 5, 6]]);
 */
function ckmeans(x, nClusters) {
    // More clusters than values cannot be filled - reject up front.
    if (nClusters > x.length) {
        throw new Error(
            "cannot generate more classes than there are data values"
        );
    }

    // Work on a sorted copy; the caller's array is not modified.
    var sorted = numericSort(x);
    // we'll use this as the maximum number of clusters
    var uniqueCount = uniqueCountSorted(sorted);

    // if all of the input values are identical, there's one cluster
    // with all of the input in it.
    if (uniqueCount === 1) {
        return [sorted];
    }

    // named 'S' originally: matrix[k][i] is the minimal within-cluster
    // sum of squares for the first i+1 values split into k+1 clusters.
    var matrix = makeMatrix(nClusters, sorted.length);
    // named 'J' originally: the index where cluster k starts in the
    // optimal solution for matrix[k][i].
    var backtrackMatrix = makeMatrix(nClusters, sorted.length);

    // This is a dynamic programming way to solve the problem of minimizing
    // within-cluster sum of squares. It's similar to linear regression
    // in this way, and this calculation incrementally computes the
    // sum of squares that are later read.
    fillMatrices(sorted, matrix, backtrackMatrix);

    // The real work of Ckmeans clustering happens in the matrix generation:
    // the generated matrices encode all possible clustering combinations, and
    // once they're generated we can solve for the best clustering groups
    // very quickly.
    var clusters = [];
    var clusterRight = backtrackMatrix[0].length - 1;

    // Backtrack the clusters from the dynamic programming matrix. This
    // starts at the bottom-right corner of the matrix (if the top-left is 0, 0),
    // and moves the cluster target with the loop.
    for (var cluster = backtrackMatrix.length - 1; cluster >= 0; cluster--) {
        // the backtrack matrix stores where this cluster begins...
        var clusterLeft = backtrackMatrix[cluster][clusterRight];

        // fill the cluster from the sorted input by taking a slice of the
        // array. the backtrack matrix makes this easy - it stores the
        // indexes where the cluster should start and end.
        clusters[cluster] = sorted.slice(clusterLeft, clusterRight + 1);

        // ...and the previous cluster ends just before this one begins.
        if (cluster > 0) {
            clusterRight = clusterLeft - 1;
        }
    }

    return clusters;
}
|
1525 |
|
/**
 * Given an array of x, this will find the extent of the
 * x and return an array of breaks that can be used
 * to categorize the x into a number of classes. The
 * returned array will always be 1 longer than the number of
 * classes because it includes the minimum value.
 *
 * @param {Array<number>} x an array of number values
 * @param {number} nClasses number of desired classes
 * @returns {Array<number>} array of class break positions
 * @example
 * equalIntervalBreaks([1, 2, 3, 4, 5, 6], 4); // => [1, 2.25, 3.5, 4.75, 6]
 */
function equalIntervalBreaks(x, nClasses) {
    // With fewer than two values there is no interval to split:
    // return the input unchanged.
    if (x.length < 2) {
        return x;
    }

    var lowest = min(x);
    var highest = max(x);

    // Each class spans an equal share of the data's full range.
    var step = (highest - lowest) / nClasses;

    // The first break is always the minimum value...
    var breaks = [lowest];

    // ...followed by one interior break per class boundary. When
    // nClasses = 1 this loop never runs and the result is [min, max].
    for (var i = 1; i < nClasses; i++) {
        breaks.push(lowest + step * i);
    }

    // ...and the last break is always the maximum.
    breaks.push(highest);

    return breaks;
}
|
1567 |
|
/**
 * [Sample covariance](https://en.wikipedia.org/wiki/Sample_mean_and_covariance) of two datasets:
 * how much do the two datasets move together?
 * x and y are two datasets, represented as arrays of numbers.
 *
 * @param {Array<number>} x a sample of two or more data points
 * @param {Array<number>} y a sample of two or more data points
 * @throws {Error} if x and y do not have equal lengths
 * @throws {Error} if x or y have length of one or less
 * @returns {number} sample covariance
 * @example
 * sampleCovariance([1, 2, 3, 4, 5, 6], [6, 5, 4, 3, 2, 1]); // => -3.5
 */
function sampleCovariance(x, y) {
    // The samples must be paired element-for-element...
    if (x.length !== y.length) {
        throw new Error("sampleCovariance requires samples with equal lengths");
    }

    // ...and must contain at least two points for the n - 1 divisor below.
    if (x.length < 2) {
        throw new Error(
            "sampleCovariance requires at least two data points in each sample"
        );
    }

    // Center each value on its own sample's mean so that datasets with
    // different absolute magnitudes - like [1, 2, 3] vs [2, 3, 4] - are
    // judged fairly by their co-movement rather than their levels.
    var meanX = mean(x);
    var meanY = mean(y);

    // Accumulate the products of paired deviations: the total grows
    // when both values sit on the same side of their respective means.
    var deviationProductSum = 0;
    for (var i = 0; i < x.length; i++) {
        deviationProductSum += (x[i] - meanX) * (y[i] - meanY);
    }

    // Bessel's correction: divide by n - 1 rather than n to account for
    // the degree of freedom consumed by estimating the means from the
    // samples themselves.
    return deviationProductSum / (x.length - 1);
}
|
1617 |
|
/**
 * The [sample variance](https://en.wikipedia.org/wiki/Variance#Sample_variance)
 * is the sum of squared deviations from the mean. The sample variance
 * is distinguished from the variance by the usage of [Bessel's Correction](https://en.wikipedia.org/wiki/Bessel's_correction):
 * instead of dividing the sum of squared deviations by the length of the input,
 * it is divided by the length minus one. This corrects the bias in estimating
 * a value from a set that you don't know if full.
 *
 * References:
 * * [Wolfram MathWorld on Sample Variance](http://mathworld.wolfram.com/SampleVariance.html)
 *
 * @param {Array<number>} x a sample of two or more data points
 * @throws {Error} if the length of x is less than 2
 * @return {number} sample variance
 * @example
 * sampleVariance([1, 2, 3, 4, 5]); // => 2.5
 */
function sampleVariance(x) {
    // The n - 1 divisor below requires at least two points.
    if (x.length < 2) {
        throw new Error("sampleVariance requires at least two data points");
    }

    // Sum of squared deviations from the mean...
    var squaredDeviationSum = sumNthPowerDeviations(x, 2);

    // ...divided by n - 1 (Bessel's correction) rather than n, to correct
    // the bias introduced by estimating the mean from the sample itself.
    return squaredDeviationSum / (x.length - 1);
}
|
1650 |
|
/**
 * The [sample standard deviation](http://en.wikipedia.org/wiki/Standard_deviation#Sample_standard_deviation)
 * is the square root of the sample variance.
 *
 * @param {Array<number>} x input array
 * @returns {number} sample standard deviation
 * @example
 * sampleStandardDeviation([2, 4, 4, 4, 5, 5, 7, 9]).toFixed(2);
 * // => '2.14'
 */
function sampleStandardDeviation(x) {
    // Simply the square root of the Bessel-corrected sample variance.
    return Math.sqrt(sampleVariance(x));
}
|
1665 |
|
/**
 * The [correlation](http://en.wikipedia.org/wiki/Correlation_and_dependence) is
 * a measure of how correlated two datasets are, between -1 and 1
 *
 * @param {Array<number>} x first input
 * @param {Array<number>} y second input
 * @returns {number} sample correlation
 * @example
 * sampleCorrelation([1, 2, 3, 4, 5, 6], [2, 2, 3, 4, 5, 60]).toFixed(2);
 * // => '0.69'
 */
function sampleCorrelation(x, y) {
    // Normalize the covariance by both standard deviations so the
    // result lands in the range [-1, 1].
    return (
        sampleCovariance(x, y) /
        sampleStandardDeviation(x) /
        sampleStandardDeviation(y)
    );
}
|
1684 |
|
/**
 * [Skewness](http://en.wikipedia.org/wiki/Skewness) is
 * a measure of the extent to which a probability distribution of a
 * real-valued random variable "leans" to one side of the mean.
 * The skewness value can be positive or negative, or even undefined.
 *
 * Implementation is based on the adjusted Fisher-Pearson standardized
 * moment coefficient, which is the version found in Excel and several
 * statistical packages including Minitab, SAS and SPSS.
 *
 * @since 4.1.0
 * @param {Array<number>} x a sample of 3 or more data points
 * @returns {number} sample skewness
 * @throws {Error} if x has length less than 3
 * @example
 * sampleSkewness([2, 4, 6, 3, 1]); // => 0.590128656384365
 */
function sampleSkewness(x) {
    // The (n - 1)(n - 2) factor below requires at least three points.
    if (x.length < 3) {
        throw new Error("sampleSkewness requires at least three data points");
    }

    var n = x.length;
    var meanValue = mean(x);
    var squaredDeviationSum = 0;
    var cubedDeviationSum = 0;

    // Accumulate the second and third powers of each deviation from
    // the mean in a single pass.
    for (var i = 0; i < n; i++) {
        var deviation = x[i] - meanValue;
        squaredDeviationSum += deviation * deviation;
        cubedDeviationSum += deviation * deviation * deviation;
    }

    // Sample standard deviation, using Bessel's correction (n - 1)
    // for the reduced degree of freedom of a sample.
    var standardDeviation = Math.sqrt(squaredDeviationSum / (n - 1));

    // Adjusted Fisher-Pearson standardized moment coefficient.
    return (
        (n * cubedDeviationSum) /
        ((n - 1) * (n - 2) * Math.pow(standardDeviation, 3))
    );
}
|
1733 |
|
1734 | /**
|
1735 | * [Kurtosis](http://en.wikipedia.org/wiki/Kurtosis) is
|
1736 | * a measure of the heaviness of a distribution's tails relative to its
|
1737 | * variance. The kurtosis value can be positive or negative, or even undefined.
|
1738 | *
|
1739 | * Implementation is based on Fisher's excess kurtosis definition and uses
|
1740 | * unbiased moment estimators. This is the version found in Excel and available
|
1741 | * in several statistical packages, including SAS and SciPy.
|
1742 | *
|
1743 | * @param {Array<number>} x a sample of 4 or more data points
|
1744 | * @returns {number} sample kurtosis
|
1745 | * @throws {Error} if x has length less than 4
|
1746 | * @example
|
1747 | * sampleKurtosis([1, 2, 2, 3, 5]); // => 1.4555765595463122
|
1748 | */
|
function sampleKurtosis(x) {
    var n = x.length;

    if (n < 4) {
        throw new Error("sampleKurtosis requires at least four data points");
    }

    var meanValue = mean(x);

    // Accumulate the (unnormalized) second and fourth central moments.
    var secondCentralMoment = 0;
    var fourthCentralMoment = 0;
    for (var i = 0; i < n; i++) {
        var deviation = x[i] - meanValue;
        secondCentralMoment += deviation * deviation;
        fourthCentralMoment += deviation * deviation * deviation * deviation;
    }

    // Fisher's excess kurtosis with unbiased moment estimators;
    // the `3 * (n - 1)` term shifts a normal distribution to zero.
    var correction = (n - 1) / ((n - 2) * (n - 3));
    var momentRatio =
        (n * (n + 1) * fourthCentralMoment) /
        (secondCentralMoment * secondCentralMoment);

    return correction * (momentRatio - 3 * (n - 1));
}
|
1774 |
|
1775 | /**
|
1776 | * Implementation of [Heap's Algorithm](https://en.wikipedia.org/wiki/Heap%27s_algorithm)
|
1777 | * for generating permutations.
|
1778 | *
|
1779 | * @param {Array} elements any type of data
|
1780 | * @returns {Array<Array>} array of permutations
|
1781 | */
|
function permutationsHeap(elements) {
    // Work on a private copy so the caller's array is not reordered as a
    // side effect (the previous code permuted `elements` in place).
    var working = elements.slice();

    // Counter array that drives the swaps in Heap's algorithm.
    var indexes = new Array(working.length);
    for (var i = 0; i < working.length; i++) {
        indexes[i] = 0;
    }

    // The identity ordering is itself the first permutation.
    var permutations = [working.slice()];

    for (var k = 0; k < working.length; ) {
        if (indexes[k] < k) {
            // At odd positions, swap from indexes[k] instead
            // of from the beginning of the array.
            var swapFrom = 0;
            if (k % 2 !== 0) {
                swapFrom = indexes[k];
            }

            // Swap between swapFrom and k, using
            // a temporary variable as storage.
            var temp = working[swapFrom];
            working[swapFrom] = working[k];
            working[k] = temp;

            permutations.push(working.slice());
            indexes[k]++;
            k = 0;
        } else {
            indexes[k] = 0;
            k++;
        }
    }

    return permutations;
}
|
1816 |
|
1817 | /**
|
1818 | * Implementation of Combinations
|
1819 | * Combinations are unique subsets of a collection - in this case, k x from a collection at a time.
|
1820 | * https://en.wikipedia.org/wiki/Combination
|
1821 | * @param {Array} x any type of data
|
 * @param {number} k the number of objects in each group (without replacement)
|
 * @returns {Array<Array>} array of combinations
|
1824 | * @example
|
1825 | * combinations([1, 2, 3], 2); // => [[1,2], [1,3], [2,3]]
|
1826 | */
|
1827 |
|
function combinations(x, k) {
    var combinationList = [];

    for (var i = 0; i < x.length; i++) {
        if (k === 1) {
            // Base case: each single element is its own combination.
            combinationList.push([x[i]]);
        } else {
            // Recurse over the elements after position i, asking for
            // groups one element smaller, then prefix x[i] onto each
            // returned combination to restore the requested size.
            var tails = combinations(x.slice(i + 1, x.length), k - 1);
            for (var j = 0; j < tails.length; j++) {
                combinationList.push([x[i]].concat(tails[j]));
            }
        }
    }

    return combinationList;
}
|
1849 |
|
1850 | /**
|
1851 | * Implementation of [Combinations](https://en.wikipedia.org/wiki/Combination) with replacement
|
1852 | * Combinations are unique subsets of a collection - in this case, k x from a collection at a time.
|
1853 | * 'With replacement' means that a given element can be chosen multiple times.
|
1854 | * Unlike permutation, order doesn't matter for combinations.
|
1855 | *
|
1856 | * @param {Array} x any type of data
|
 * @param {number} k the number of objects in each group (with replacement)
|
 * @returns {Array<Array>} array of combinations
|
1859 | * @example
|
1860 | * combinationsReplacement([1, 2], 2); // => [[1, 1], [1, 2], [2, 2]]
|
1861 | */
|
function combinationsReplacement(x, k) {
    var results = [];

    for (var i = 0; i < x.length; i++) {
        if (k === 1) {
            // A group of one is just the element itself; no recursion needed.
            results.push([x[i]]);
            continue;
        }

        // Recurse with k - 1 over the elements from position i onward.
        // Starting at i (rather than i + 1) is what allows the same
        // element to be chosen more than once; prefixing x[i] onto each
        // result restores the requested group size. The recursion
        // bottoms out when k reaches 1.
        var tails = combinationsReplacement(x.slice(i, x.length), k - 1);
        for (var j = 0; j < tails.length; j++) {
            results.push([x[i]].concat(tails[j]));
        }
    }

    return results;
}
|
1891 |
|
1892 | /**
|
1893 | * When adding a new value to a list, one does not have to necessary
|
1894 | * recompute the mean of the list in linear time. They can instead use
|
1895 | * this function to compute the new mean by providing the current mean,
|
1896 | * the number of elements in the list that produced it and the new
|
1897 | * value to add.
|
1898 | *
|
1899 | * @since 2.5.0
|
1900 | * @param {number} mean current mean
|
1901 | * @param {number} n number of items in the list
|
1902 | * @param {number} newValue the added value
|
1903 | * @returns {number} the new mean
|
1904 | *
|
1905 | * @example
|
1906 | * addToMean(14, 5, 53); // => 20.5
|
1907 | */
|
function addToMean(mean, n, newValue) {
    // Shift the old mean by the new value's deviation from it,
    // scaled down by the new element count (n + 1).
    var deviation = newValue - mean;
    return mean + deviation / (n + 1);
}
|
1911 |
|
1912 | /**
|
1913 | * When combining two lists of values for which one already knows the means,
|
1914 | * one does not have to necessary recompute the mean of the combined lists in
|
1915 | * linear time. They can instead use this function to compute the combined
|
1916 | * mean by providing the mean & number of values of the first list and the mean
|
1917 | * & number of values of the second list.
|
1918 | *
|
1919 | * @since 3.0.0
|
1920 | * @param {number} mean1 mean of the first list
|
1921 | * @param {number} n1 number of items in the first list
|
1922 | * @param {number} mean2 mean of the second list
|
1923 | * @param {number} n2 number of items in the second list
|
1924 | * @returns {number} the combined mean
|
1925 | *
|
1926 | * @example
|
1927 | * combineMeans(5, 3, 4, 3); // => 4.5
|
1928 | */
|
function combineMeans(mean1, n1, mean2, n2) {
    // Weight each mean by its sample size, then renormalize by the
    // combined count.
    var weightedTotal = mean1 * n1 + mean2 * n2;
    return weightedTotal / (n1 + n2);
}
|
1932 |
|
1933 | /**
|
1934 | * When combining two lists of values for which one already knows the variances,
|
1935 | * one does not have to necessary recompute the variance of the combined lists
|
1936 | * in linear time. They can instead use this function to compute the combined
|
1937 | * variance by providing the variance, mean & number of values of the first list
|
1938 | * and the variance, mean & number of values of the second list.
|
1939 | *
|
1940 | * @since 3.0.0
|
1941 | * @param {number} variance1 variance of the first list
|
1942 | * @param {number} mean1 mean of the first list
|
1943 | * @param {number} n1 number of items in the first list
|
1944 | * @param {number} variance2 variance of the second list
|
1945 | * @param {number} mean2 mean of the second list
|
1946 | * @param {number} n2 number of items in the second list
|
 * @returns {number} the combined variance
|
1948 | *
|
1949 | * @example
|
1950 | * combineVariances(14 / 3, 5, 3, 8 / 3, 4, 3); // => 47 / 12
|
1951 | */
|
function combineVariances(variance1, mean1, n1, variance2, mean2, n2) {
    // Mean of the combined list; each group's variance must be
    // re-centered on it.
    var combinedMean = combineMeans(mean1, n1, mean2, n2);

    // Each group contributes its own variance plus the squared shift of
    // its mean from the combined mean, weighted by its size.
    var contribution1 = n1 * (variance1 + Math.pow(mean1 - combinedMean, 2));
    var contribution2 = n2 * (variance2 + Math.pow(mean2 - combinedMean, 2));

    return (contribution1 + contribution2) / (n1 + n2);
}
|
1961 |
|
1962 | /**
|
1963 | * The [Geometric Mean](https://en.wikipedia.org/wiki/Geometric_mean) is
|
1964 | * a mean function that is more useful for numbers in different
|
1965 | * ranges.
|
1966 | *
|
1967 | * This is the nth root of the input numbers multiplied by each other.
|
1968 | *
|
1969 | * The geometric mean is often useful for
|
1970 | * **[proportional growth](https://en.wikipedia.org/wiki/Geometric_mean#Proportional_growth)**: given
|
1971 | * growth rates for multiple years, like _80%, 16.66% and 42.85%_, a simple
|
1972 | * mean will incorrectly estimate an average growth rate, whereas a geometric
|
1973 | * mean will correctly estimate a growth rate that, over those years,
|
1974 | * will yield the same end value.
|
1975 | *
|
1976 | * This runs in `O(n)`, linear time, with respect to the length of the array.
|
1977 | *
|
1978 | * @param {Array<number>} x sample of one or more data points
|
1979 | * @returns {number} geometric mean
|
1980 | * @throws {Error} if x is empty
|
 * @throws {Error} if x contains a zero or negative number
|
1982 | * @example
|
1983 | * var growthRates = [1.80, 1.166666, 1.428571];
|
1984 | * var averageGrowth = ss.geometricMean(growthRates);
|
1985 | * var averageGrowthRates = [averageGrowth, averageGrowth, averageGrowth];
|
1986 | * var startingValue = 10;
|
1987 | * var startingValueMean = 10;
|
1988 | * growthRates.forEach(function(rate) {
|
1989 | * startingValue *= rate;
|
1990 | * });
|
1991 | * averageGrowthRates.forEach(function(rate) {
|
1992 | * startingValueMean *= rate;
|
1993 | * });
|
1994 | * startingValueMean === startingValue;
|
1995 | */
|
function geometricMean(x) {
    if (x.length === 0) {
        throw new Error("geometricMean requires at least one data point");
    }

    // Multiply every value together, rejecting non-positive inputs,
    // for which the geometric mean is not defined.
    var product = 1;
    for (var i = 0; i < x.length; i++) {
        if (x[i] <= 0) {
            throw new Error(
                "geometricMean requires only positive numbers as input"
            );
        }
        product *= x[i];
    }

    // The geometric mean is the nth root of the product.
    return Math.pow(product, 1 / x.length);
}
|
2018 |
|
2019 | /**
|
2020 | * The [Harmonic Mean](https://en.wikipedia.org/wiki/Harmonic_mean) is
|
2021 | * a mean function typically used to find the average of rates.
|
2022 | * This mean is calculated by taking the reciprocal of the arithmetic mean
|
2023 | * of the reciprocals of the input numbers.
|
2024 | *
|
2025 | * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
|
2026 | * a method of finding a typical or central value of a set of numbers.
|
2027 | *
|
2028 | * This runs in `O(n)`, linear time, with respect to the length of the array.
|
2029 | *
|
2030 | * @param {Array<number>} x sample of one or more data points
|
2031 | * @returns {number} harmonic mean
|
2032 | * @throws {Error} if x is empty
|
 * @throws {Error} if x contains a zero or negative number
|
2034 | * @example
|
2035 | * harmonicMean([2, 3]).toFixed(2) // => '2.40'
|
2036 | */
|
function harmonicMean(x) {
    if (x.length === 0) {
        throw new Error("harmonicMean requires at least one data point");
    }

    // Sum the reciprocals of the inputs. Zero and negative values are
    // rejected because the harmonic mean is not defined for them.
    var sumOfReciprocals = 0;
    for (var i = 0; i < x.length; i++) {
        if (x[i] <= 0) {
            throw new Error(
                "harmonicMean requires only positive numbers as input"
            );
        }
        sumOfReciprocals += 1 / x[i];
    }

    // The harmonic mean is the count divided by the reciprocal sum.
    return x.length / sumOfReciprocals;
}
|
2058 |
|
2059 | /**
|
2060 | * The mean, _also known as average_,
|
2061 | * is the sum of all values over the number of values.
|
2062 | * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
|
2063 | * a method of finding a typical or central value of a set of numbers.
|
2064 | *
|
2065 | * The simple mean uses the successive addition method internally
|
2066 | * to calculate it's result. Errors in floating-point addition are
|
2067 | * not accounted for, so if precision is required, the standard {@link mean}
|
2068 | * method should be used instead.
|
2069 | *
|
2070 | * This runs in `O(n)`, linear time, with respect to the length of the array.
|
2071 | *
|
2072 | *
|
2073 | * @param {Array<number>} x sample of one or more data points
|
 * @throws {Error} if the length of x is less than one
|
2075 | * @returns {number} mean
|
2076 | * @example
|
 * meanSimple([0, 10]); // => 5
|
2078 | */
|
function meanSimple(x) {
    if (x.length === 0) {
        throw new Error("meanSimple requires at least one data point");
    }

    // Plain successive addition, then divide by the count; no
    // floating-point error compensation is attempted here.
    var total = sumSimple(x);
    return total / x.length;
}
|
2086 |
|
2087 | /**
|
2088 | * The [median](http://en.wikipedia.org/wiki/Median) is
|
2089 | * the middle number of a list. This is often a good indicator of 'the middle'
|
2090 | * when there are outliers that skew the `mean()` value.
|
2091 | * This is a [measure of central tendency](https://en.wikipedia.org/wiki/Central_tendency):
|
2092 | * a method of finding a typical or central value of a set of numbers.
|
2093 | *
|
2094 | * The median isn't necessarily one of the elements in the list: the value
|
2095 | * can be the average of two elements if the list has an even length
|
2096 | * and the two central values are different.
|
2097 | *
|
 * @param {Array<number>} sorted input, sorted in ascending order
|
2099 | * @returns {number} median value
|
2100 | * @example
|
 * medianSorted([1, 2, 2, 5, 10, 100]); // => 3.5
|
2102 | */
|
function medianSorted(sorted) {
    // The median is, by definition, the 0.5 quantile, so delegate to
    // quantileSorted. The input must already be sorted ascending.
    return quantileSorted(sorted, 0.5);
}
|
2106 |
|
2107 | /**
|
2108 | * When removing a value from a list, one does not have to necessary
|
2109 | * recompute the mean of the list in linear time. They can instead use
|
2110 | * this function to compute the new mean by providing the current mean,
|
2111 | * the number of elements in the list that produced it and the value to remove.
|
2112 | *
|
2113 | * @since 3.0.0
|
2114 | * @param {number} mean current mean
|
2115 | * @param {number} n number of items in the list
|
2116 | * @param {number} value the value to remove
|
2117 | * @returns {number} the new mean
|
2118 | *
|
2119 | * @example
|
2120 | * subtractFromMean(20.5, 6, 53); // => 14
|
2121 | */
|
function subtractFromMean(mean, n, value) {
    // Recover the original total from the mean, remove the value, and
    // average over the remaining n - 1 elements.
    var remainingTotal = mean * n - value;
    return remainingTotal / (n - 1);
}
|
2125 |
|
2126 | /**
|
2127 | * The Root Mean Square (RMS) is
|
2128 | * a mean function used as a measure of the magnitude of a set
|
2129 | * of numbers, regardless of their sign.
|
2130 | * This is the square root of the mean of the squares of the
|
2131 | * input numbers.
|
2132 | * This runs in `O(n)`, linear time, with respect to the length of the array.
|
2133 | *
|
2134 | * @param {Array<number>} x a sample of one or more data points
|
2135 | * @returns {number} root mean square
|
2136 | * @throws {Error} if x is empty
|
2137 | * @example
|
2138 | * rootMeanSquare([-1, 1, -1, 1]); // => 1
|
2139 | */
|
function rootMeanSquare(x) {
    if (x.length === 0) {
        throw new Error("rootMeanSquare requires at least one data point");
    }

    // Total of the squared values...
    var squaredTotal = 0;
    for (var i = 0; i < x.length; i++) {
        squaredTotal += Math.pow(x[i], 2);
    }

    // ...averaged, then square-rooted back to the input scale.
    return Math.sqrt(squaredTotal / x.length);
}
|
2152 |
|
2153 | /**
|
2154 | * This is to compute [a one-sample t-test](https://en.wikipedia.org/wiki/Student%27s_t-test#One-sample_t-test), comparing the mean
|
2155 | * of a sample to a known value, x.
|
2156 | *
|
2157 | * in this case, we're trying to determine whether the
|
2158 | * population mean is equal to the value that we know, which is `x`
|
2159 | * here. usually the results here are used to look up a
|
2160 | * [p-value](http://en.wikipedia.org/wiki/P-value), which, for
|
2161 | * a certain level of significance, will let you determine that the
|
2162 | * null hypothesis can or cannot be rejected.
|
2163 | *
|
2164 | * @param {Array<number>} x sample of one or more numbers
|
2165 | * @param {number} expectedValue expected value of the population mean
|
2166 | * @returns {number} value
|
2167 | * @example
|
2168 | * tTest([1, 2, 3, 4, 5, 6], 3.385).toFixed(2); // => '0.16'
|
2169 | */
|
function tTest(x, expectedValue) {
    // The t statistic is the distance of the sample mean from the
    // hypothesized population mean, measured in units of the standard
    // error (sd / sqrt(n)).
    var sampleMean = mean(x);
    var standardError = standardDeviation(x) / Math.sqrt(x.length);
    return (sampleMean - expectedValue) / standardError;
}
|
2183 |
|
2184 | /**
|
2185 | * This is to compute [two sample t-test](http://en.wikipedia.org/wiki/Student's_t-test).
|
2186 | * Tests whether "mean(X)-mean(Y) = difference", (
|
2187 | * in the most common case, we often have `difference == 0` to test if two samples
|
2188 | * are likely to be taken from populations with the same mean value) with
|
2189 | * no prior knowledge on standard deviations of both samples
|
2190 | * other than the fact that they have the same standard deviation.
|
2191 | *
|
2192 | * Usually the results here are used to look up a
|
2193 | * [p-value](http://en.wikipedia.org/wiki/P-value), which, for
|
2194 | * a certain level of significance, will let you determine that the
|
2195 | * null hypothesis can or cannot be rejected.
|
2196 | *
|
2197 | * `diff` can be omitted if it equals 0.
|
2198 | *
|
2199 | * [This is used to confirm or deny](http://www.monarchlab.org/Lab/Research/Stats/2SampleT.aspx)
|
2200 | * a null hypothesis that the two populations that have been sampled into
|
2201 | * `sampleX` and `sampleY` are equal to each other.
|
2202 | *
|
2203 | * @param {Array<number>} sampleX a sample as an array of numbers
|
2204 | * @param {Array<number>} sampleY a sample as an array of numbers
|
2205 | * @param {number} [difference=0]
|
2206 | * @returns {number|null} test result
|
2207 | *
|
2208 | * @example
|
2209 | * tTestTwoSample([1, 2, 3, 4], [3, 4, 5, 6], 0); // => -2.1908902300206643
|
2210 | */
|
function tTestTwoSample(sampleX, sampleY, difference) {
    var n = sampleX.length;
    var m = sampleY.length;

    // Without data on both sides there is nothing to compare.
    if (!n || !m) {
        return null;
    }

    // The hypothesized difference of means (mu) defaults to zero.
    if (!difference) {
        difference = 0;
    }

    var meanX = mean(sampleX);
    var meanY = mean(sampleY);
    var sampleVarianceX = sampleVariance(sampleX);
    var sampleVarianceY = sampleVariance(sampleY);

    // Guard clause: if any statistic came back non-numeric, the result
    // is undefined (matching the historical behavior of this function).
    if (
        typeof meanX !== "number" ||
        typeof meanY !== "number" ||
        typeof sampleVarianceX !== "number" ||
        typeof sampleVarianceY !== "number"
    ) {
        return;
    }

    // Pool the two sample variances, weighting each by its degrees of
    // freedom (n - 1 and m - 1).
    var weightedVariance =
        ((n - 1) * sampleVarianceX + (m - 1) * sampleVarianceY) /
        (n + m - 2);

    // Standardize the observed difference of means.
    return (
        (meanX - meanY - difference) /
        Math.sqrt(weightedVariance * (1 / n + 1 / m))
    );
}
|
2247 |
|
2248 | /**
|
2249 | * [Bayesian Classifier](http://en.wikipedia.org/wiki/Naive_Bayes_classifier)
|
2250 | *
|
2251 | * This is a naïve bayesian classifier that takes
|
2252 | * singly-nested objects.
|
2253 | *
|
2254 | * @class
|
2255 | * @example
|
2256 | * var bayes = new BayesianClassifier();
|
2257 | * bayes.train({
|
2258 | * species: 'Cat'
|
2259 | * }, 'animal');
|
2260 | * var result = bayes.score({
|
2261 | * species: 'Cat'
|
2262 | * })
|
2263 | * // result
|
2264 | * // {
|
2265 | * // animal: 1
|
2266 | * // }
|
2267 | */
|
var BayesianClassifier = function BayesianClassifier() {
    // The number of items that are currently
    // classified in the model
    this.totalCount = 0;
    // Every item classified in the model: nested per-category counts
    // of each key/value pair seen by `train()`.
    this.data = {};
};
|
2275 |
|
2276 | /**
|
2277 | * Train the classifier with a new item, which has a single
|
2278 | * dimension of Javascript literal keys and values.
|
2279 | *
|
2280 | * @param {Object} item an object with singly-deep properties
|
2281 | * @param {string} category the category this item belongs to
|
2282 | * @return {undefined} adds the item to the classifier
|
2283 | */
|
BayesianClassifier.prototype.train = function train (item, category) {
    // Lazily create the bucket of counts for this category.
    if (!this.data[category]) {
        this.data[category] = {};
    }

    var categoryCounts = this.data[category];

    // For every key/value pair on the item, make sure the nested
    // counter `data[category][k][v]` exists, then bump it.
    for (var k in item) {
        var v = item[k];
        if (categoryCounts[k] === undefined) {
            categoryCounts[k] = {};
        }
        if (categoryCounts[k][v] === undefined) {
            categoryCounts[k][v] = 0;
        }
        categoryCounts[k][v]++;
    }

    // One more example has been classified.
    this.totalCount++;
};
|
2310 |
|
2311 | /**
|
2312 | * Generate a score of how well this item matches all
|
2313 | * possible categories based on its attributes
|
2314 | *
|
2315 | * @param {Object} item an item in the same format as with train
|
2316 | * @returns {Object} of probabilities that this item belongs to a
|
2317 | * given category.
|
2318 | */
|
BayesianClassifier.prototype.score = function score (item) {
    // Initialize an empty map of per-category odds objects.
    var odds = {};
    var category;
    // Iterate through each key in the item,
    // then iterate through each category that has been used
    // in previous calls to `.train()`
    for (var k in item) {
        var v = item[k];
        for (category in this.data) {
            // Create an empty object for storing key - value combinations
            // for this category.
            // NOTE(review): this reassignment runs once per item key, so it
            // wipes the entries recorded for earlier keys of `item` — only
            // the last key's odds survive into the sums below. Confirm
            // whether that is intended before relying on multi-key scores.
            odds[category] = {};

            // If this item doesn't even have a property, it counts for nothing,
            // but if it does have the property that we're looking for from
            // the item to categorize, it counts based on how popular it is
            // versus the whole population.
            if (this.data[category][k]) {
                odds[category][k + "_" + v] =
                    (this.data[category][k][v] || 0) / this.totalCount;
            } else {
                odds[category][k + "_" + v] = 0;
            }
        }
    }

    // Set up a new object that will contain sums of these odds by category
    var oddsSums = {};

    for (category in odds) {
        // Tally all of the odds for each category-combination pair -
        // the non-existence of a category does not add anything to the
        // score.
        oddsSums[category] = 0;
        for (var combination in odds[category]) {
            oddsSums[category] += odds[category][combination];
        }
    }

    return oddsSums;
};
|
2361 |
|
2362 | /**
|
2363 | * This is a single-layer [Perceptron Classifier](http://en.wikipedia.org/wiki/Perceptron) that takes
|
2364 | * arrays of numbers and predicts whether they should be classified
|
2365 | * as either 0 or 1 (negative or positive examples).
|
2366 | * @class
|
2367 | * @example
|
2368 | * // Create the model
|
2369 | * var p = new PerceptronModel();
|
2370 | * // Train the model with input with a diagonal boundary.
|
2371 | * for (var i = 0; i < 5; i++) {
|
2372 | * p.train([1, 1], 1);
|
2373 | * p.train([0, 1], 0);
|
2374 | * p.train([1, 0], 0);
|
2375 | * p.train([0, 0], 0);
|
2376 | * }
|
2377 | * p.predict([0, 0]); // 0
|
2378 | * p.predict([0, 1]); // 0
|
2379 | * p.predict([1, 0]); // 0
|
2380 | * p.predict([1, 1]); // 1
|
2381 | */
|
var PerceptronModel = function PerceptronModel() {
    // The weights, or coefficients of the model;
    // weights are only populated when training with data.
    // Until then, `predict` returns null for any non-empty feature
    // array because the lengths cannot match.
    this.weights = [];
    // The bias term, or intercept; it is also a weight but
    // it's stored separately for convenience as it is always
    // multiplied by one.
    this.bias = 0;
};
|
2391 | /**
|
2392 | * **Predict**: Use an array of features with the weight array and bias
|
2393 | * to predict whether an example is labeled 0 or 1.
|
2394 | *
|
2395 | * @param {Array<number>} features an array of features as numbers
|
2396 | * @returns {number} 1 if the score is over 0, otherwise 0
|
2397 | */
|
PerceptronModel.prototype.predict = function predict (features) {
    // A prediction is only meaningful when the model has been trained
    // on feature arrays of the same length.
    if (features.length !== this.weights.length) {
        return null;
    }

    // Dot product of the weights and features, with the bias term
    // added afterwards (implicitly multiplied by one).
    var score = 0;
    for (var i = 0; i < this.weights.length; i++) {
        score += this.weights[i] * features[i];
    }
    score += this.bias;

    // Positive scores are labeled 1; everything else is labeled 0.
    return score > 0 ? 1 : 0;
};
|
2420 |
|
2421 | /**
|
2422 | * **Train** the classifier with a new example, which is
|
2423 | * a numeric array of features and a 0 or 1 label.
|
2424 | *
|
2425 | * @param {Array<number>} features an array of features as numbers
|
2426 | * @param {number} label either 0 or 1
|
2427 | * @returns {PerceptronModel} this
|
2428 | */
|
PerceptronModel.prototype.train = function train (features, label) {
    // Require that only labels of 0 or 1 are considered.
    if (label !== 0 && label !== 1) {
        return null;
    }
    // The length of the feature array determines
    // the length of the weight array.
    // The perceptron will continue learning as long as
    // it keeps seeing feature arrays of the same length.
    // When it sees a new data shape, it initializes.
    if (features.length !== this.weights.length) {
        // Copy the features: the previous code aliased the caller's
        // array directly, so subsequent in-place weight updates
        // mutated the caller's data.
        this.weights = features.slice();
        this.bias = 1;
    }
    // Make a prediction based on current weights.
    var prediction = this.predict(features);
    // Update the weights if the prediction is wrong.
    if (typeof prediction === "number" && prediction !== label) {
        var gradient = label - prediction;
        for (var i = 0; i < this.weights.length; i++) {
            this.weights[i] += gradient * features[i];
        }
        this.bias += gradient;
    }
    return this;
};
|
2455 |
|
2456 | /**
|
2457 | * We use `ε`, epsilon, as a stopping criterion when we want to iterate
|
2458 | * until we're "close enough". Epsilon is a very small number: for
|
2459 | * simple statistics, that number is **0.0001**
|
2460 | *
|
2461 | * This is used in calculations like the binomialDistribution, in which
|
2462 | * the process of finding a value is [iterative](https://en.wikipedia.org/wiki/Iterative_method):
|
2463 | * it progresses until it is close enough.
|
2464 | *
|
2465 | * Below is an example of using epsilon in [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent),
|
2466 | * where we're trying to find a local minimum of a function's derivative,
|
2467 | * given by the `fDerivative` method.
|
2468 | *
|
2469 | * @example
|
2470 | * // From calculation, we expect that the local minimum occurs at x=9/4
|
2471 | * var x_old = 0;
|
2472 | * // The algorithm starts at x=6
|
2473 | * var x_new = 6;
|
2474 | * var stepSize = 0.01;
|
2475 | *
|
2476 | * function fDerivative(x) {
|
2477 | * return 4 * Math.pow(x, 3) - 9 * Math.pow(x, 2);
|
2478 | * }
|
2479 | *
|
2480 | * // The loop runs until the difference between the previous
|
2481 | * // value and the current value is smaller than epsilon - a rough
|
 * // measure of 'close enough'
|
2483 | * while (Math.abs(x_new - x_old) > ss.epsilon) {
|
2484 | * x_old = x_new;
|
2485 | * x_new = x_old - stepSize * fDerivative(x_old);
|
2486 | * }
|
2487 | *
|
2488 | * console.log('Local minimum occurs at', x_new);
|
2489 | */
|
// Stopping tolerance for iterative approximations (see the note above).
var epsilon = 0.0001;
|
2491 |
|
2492 | /**
|
2493 | * A [Factorial](https://en.wikipedia.org/wiki/Factorial), usually written n!, is the product of all positive
|
2494 | * integers less than or equal to n. Often factorial is implemented
|
2495 | * recursively, but this iterative approach is significantly faster
|
2496 | * and simpler.
|
2497 | *
|
2498 | * @param {number} n input, must be an integer number 1 or greater
|
2499 | * @returns {number} factorial: n!
|
2500 | * @throws {Error} if n is less than 0 or not an integer
|
2501 | * @example
|
2502 | * factorial(5); // => 120
|
2503 | */
|
function factorial(n) {
    // Factorial is mathematically undefined for negative numbers.
    if (n < 0) {
        throw new Error("factorial requires a non-negative value");
    }

    // Reject non-integer input.
    if (Math.floor(n) !== n) {
        throw new Error("factorial requires an integer input");
    }

    // Multiply upward from 2 through n rather than expanding downward;
    // 0! and 1! fall through to 1 because the loop body never runs.
    var product = 1;
    for (var factor = 2; factor <= n; factor++) {
        product *= factor;
    }
    return product;
}
|
2526 |
|
2527 | /**
|
2528 | * Compute the [gamma function](https://en.wikipedia.org/wiki/Gamma_function) of a value using Nemes' approximation.
|
2529 | * The gamma of n is equivalent to (n-1)!, but unlike the factorial function, gamma is defined for all real n except zero
|
2530 | * and negative integers (where NaN is returned). Note, the gamma function is also well-defined for complex numbers,
|
2531 | * though this implementation currently does not handle complex numbers as input values.
|
2532 | * Nemes' approximation is defined [here](https://arxiv.org/abs/1003.6020) as Theorem 2.2.
|
2533 | * Negative values use [Euler's reflection formula](https://en.wikipedia.org/wiki/Gamma_function#Properties) for computation.
|
2534 | *
|
2535 | * @param {number} n Any real number except for zero and negative integers.
|
2536 | * @returns {number} The gamma of the input value.
|
2537 | *
|
2538 | * @example
|
2539 | * gamma(11.5); // 11899423.084037038
|
2540 | * gamma(-11.5); // 2.29575810481609e-8
|
2541 | * gamma(5); // 24
|
2542 | */
|
function gamma(n) {
    if (Number.isInteger(n)) {
        if (n <= 0) {
            // gamma is not defined for zero or negative integers (poles)
            return NaN;
        } else {
            // for positive integers, Γ(n) = (n - 1)! exactly,
            // so defer to the exact factorial implementation
            return factorial(n - 1);
        }
    }

    // Decrement n, because approximation is defined for n - 1
    n--;

    if (n < 0) {
        // Use Euler's reflection formula for negative inputs:
        // Γ(z)Γ(1 − z) = π / sin(πz)
        // see: https://en.wikipedia.org/wiki/Gamma_function#Properties
        return Math.PI / (Math.sin(Math.PI * -n) * gamma(-n));
    } else {
        // Nemes' expansion approximation — the leading Stirling-like
        // factor (n/e)^n · √(2π(n + 1/6)) ...
        var seriesCoefficient =
            Math.pow(n / Math.E, n) * Math.sqrt(2 * Math.PI * (n + 1 / 6));

        var seriesDenom = n + 1 / 4;

        // ... multiplied by a correction series in inverse powers
        // of (n + 1/4). Coefficients follow the published expansion
        // (https://arxiv.org/abs/1003.6020, Theorem 2.2).
        var seriesExpansion =
            1 +
            1 / 144 / Math.pow(seriesDenom, 2) -
            1 / 12960 / Math.pow(seriesDenom, 3) -
            257 / 207360 / Math.pow(seriesDenom, 4) -
            52 / 2612736 / Math.pow(seriesDenom, 5) +
            5741173 / 9405849600 / Math.pow(seriesDenom, 6) +
            37529 / 18811699200 / Math.pow(seriesDenom, 7);

        return seriesCoefficient * seriesExpansion;
    }
}
|
2580 |
|
// Define series coefficients
// NOTE(review): these appear to be Lanczos-approximation coefficients
// (15 terms, paired with g = 607/128 below) consumed by `gammaln` —
// see http://my.fit.edu/~gabdo/gamma.txt; confirm before editing.
var COEFFICIENTS = [
    0.99999999999999709182,
    57.156235665862923517,
    -59.597960355475491248,
    14.136097974741747174,
    -0.49191381609762019978,
    0.33994649984811888699e-4,
    0.46523628927048575665e-4,
    -0.98374475304879564677e-4,
    0.15808870322491248884e-3,
    -0.21026444172410488319e-3,
    0.2174396181152126432e-3,
    -0.16431810653676389022e-3,
    0.84418223983852743293e-4,
    -0.2619083840158140867e-4,
    0.36899182659531622704e-5
];
|
2599 |
|
// Lanczos parameter used by `gammaln` below.
var g = 607 / 128;
// Precomputed ln(√(2π)), a constant term of the Lanczos formula.
var LOGSQRT2PI = Math.log(Math.sqrt(2 * Math.PI));
|
2602 |
|
2603 | /**
|
2604 | * Compute the logarithm of the [gamma function](https://en.wikipedia.org/wiki/Gamma_function) of a value using Lanczos' approximation.
|
2605 | * This function takes as input any real-value n greater than 0.
|
2606 | * This function is useful for values of n too large for the normal gamma function (n > 165).
|
2607 | * The code is based on Lanczo's Gamma approximation, defined [here](http://my.fit.edu/~gabdo/gamma.txt).
|
2608 | *
|
2609 | * @param {number} n Any real number greater than zero.
|
2610 | * @returns {number} The logarithm of gamma of the input value.
|
2611 | *
|
2612 | * @example
|
2613 | * gammaln(500); // 2605.1158503617335
|
2614 | * gammaln(2.4); // 0.21685932244884043
|
2615 | */
|
function gammaln(n) {
    // Return infinity if value not in domain (this real-valued
    // implementation only handles n > 0)
    if (n <= 0) {
        return Infinity;
    }

    // Decrement n, because approximation is defined for n - 1
    n--;

    // Create series approximation: a = c0 + Σ ci / (n + i),
    // using the module-level COEFFICIENTS table (15 terms).
    var a = COEFFICIENTS[0];

    for (var i = 1; i < 15; i++) {
        a += COEFFICIENTS[i] / (n + i);
    }

    // `g` is the module-level Lanczos parameter (607/128).
    var tmp = g + 0.5 + n;

    // Return natural logarithm of gamma(n):
    // ln Γ = ln(√(2π)) + ln(a) − (g + n + 0.5) + (n + 0.5)·ln(g + n + 0.5)
    return LOGSQRT2PI + Math.log(a) - tmp + (n + 0.5) * Math.log(tmp);
}
|
2637 |
|
/**
 * The [Bernoulli distribution](http://en.wikipedia.org/wiki/Bernoulli_distribution)
 * is the discrete probability distribution of a random variable that
 * takes the value 1 with success probability `p` and the value 0 with
 * failure probability `q` = 1 - `p`. It can model, for example, a coin
 * toss where "1" means "heads" and "0" means "tails" (or vice versa).
 * It is the special case of the Binomial Distribution with `n` = 1.
 *
 * @param {number} p input value, between 0 and 1 inclusive
 * @returns {number[]} values of bernoulli distribution at this point
 * @throws {Error} if p is outside 0 and 1
 * @example
 * bernoulliDistribution(0.3); // => [0.7, 0.3]
 */
function bernoulliDistribution(p) /*: number[] */ {
    // A probability must lie in the closed interval [0, 1].
    if (p < 0 || p > 1) {
        throw new Error(
            "bernoulliDistribution requires probability to be between 0 and 1 inclusive"
        );
    }

    // Index 0 holds P(X = 0) = 1 − p, index 1 holds P(X = 1) = p.
    var failureProbability = 1 - p;
    return [failureProbability, p];
}
|
2665 |
|
/**
 * The [Binomial Distribution](http://en.wikipedia.org/wiki/Binomial_distribution) is the discrete probability
 * distribution of the number of successes in a sequence of n independent yes/no experiments, each of which yields
 * success with probability `probability`. Such a success/failure experiment is also called a Bernoulli experiment or
 * Bernoulli trial; when trials = 1, the Binomial Distribution is a Bernoulli Distribution.
 *
 * @param {number} trials number of trials to simulate; must be a strictly positive integer
 * @param {number} probability probability of success on each trial, between 0 and 1 inclusive
 * @returns {number[]|undefined} probability mass function values indexed by number
 * of successes, or `undefined` when the parameters are invalid
 */
function binomialDistribution(trials, probability) /*: ?number[] */ {
    // Check that `p` is a valid probability (0 ≤ p ≤ 1),
    // that `n` is an integer, strictly positive.
    if (probability < 0 || probability > 1 || trials <= 0 || trials % 1 !== 0) {
        return undefined;
    }

    // We initialize `x`, the random variable (number of successes), and
    // `cumulativeProbability`, an accumulator of the probability mass
    // emitted so far. `cells` holds the PMF values to return, and
    // `binomialCoefficient` is C(trials, x), maintained incrementally.
    // We iterate until the accumulated probability is within `epsilon` of 1.0.
    var x = 0;
    var cumulativeProbability = 0;
    var cells = [];
    var binomialCoefficient = 1;

    // This algorithm iterates through each potential outcome,
    // until the `cumulativeProbability` is very close to 1, at
    // which point we've defined the vast majority of outcomes
    do {
        // a [probability mass function](https://en.wikipedia.org/wiki/Probability_mass_function):
        // C(n, x) · p^x · (1 − p)^(n − x)
        cells[x] =
            binomialCoefficient *
            Math.pow(probability, x) *
            Math.pow(1 - probability, trials - x);
        cumulativeProbability += cells[x];
        x++;
        // Update C(trials, x) from C(trials, x − 1) without recomputing
        // factorials: C(n, x) = C(n, x − 1) · (n − x + 1) / x
        binomialCoefficient = (binomialCoefficient * (trials - x + 1)) / x;
        // when the cumulativeProbability is nearly 1, we've calculated
        // the useful range of this distribution
        // NOTE(review): `epsilon` is a module-level tolerance defined
        // elsewhere in this file.
    } while (cumulativeProbability < 1 - epsilon);

    return cells;
}
|
2712 |
|
/**
 * The [Poisson Distribution](http://en.wikipedia.org/wiki/Poisson_distribution)
 * is a discrete probability distribution that expresses the probability
 * of a given number of events occurring in a fixed interval of time
 * and/or space if these events occur with a known average rate and
 * independently of the time since the last event.
 *
 * The Poisson Distribution is characterized by the strictly positive
 * mean arrival or occurrence rate, `λ`.
 *
 * @param {number} lambda location poisson distribution
 * @returns {number[]} values of poisson distribution at that point
 */
function poissonDistribution(lambda) /*: ?number[] */ {
    // The rate parameter must be strictly positive.
    if (lambda <= 0) {
        return undefined;
    }

    // `k` is the outcome currently being evaluated, `totalProbability`
    // tracks how much probability mass has been emitted so far,
    // `probabilities` collects the PMF values to be returned, and
    // `kFactorial` maintains k! incrementally.
    var k = 0;
    var totalProbability = 0;
    var probabilities = [];
    var kFactorial = 1;

    // Emit outcomes one by one until nearly all of the probability
    // mass — within `epsilon` of 1 — has been accounted for, which
    // covers the useful range of the distribution.
    do {
        // Poisson probability mass function: e^(−λ) · λ^k / k!
        probabilities[k] =
            (Math.exp(-lambda) * Math.pow(lambda, k)) / kFactorial;
        totalProbability += probabilities[k];
        k++;
        kFactorial *= k;
    } while (totalProbability < 1 - epsilon);

    return probabilities;
}
|
2756 |
|
2757 | /**
|
2758 | * **Percentage Points of the χ2 (Chi-Squared) Distribution**
|
2759 | *
|
2760 | * The [χ2 (Chi-Squared) Distribution](http://en.wikipedia.org/wiki/Chi-squared_distribution) is used in the common
|
2761 | * chi-squared tests for goodness of fit of an observed distribution to a theoretical one, the independence of two
|
2762 | * criteria of classification of qualitative data, and in confidence interval estimation for a population standard
|
2763 | * deviation of a normal distribution from a sample standard deviation.
|
2764 | *
|
2765 | * Values from Appendix 1, Table III of William W. Hines & Douglas C. Montgomery, "Probability and Statistics in
|
2766 | * Engineering and Management Science", Wiley (1980).
|
2767 | */
|
// Outer keys are degrees of freedom; inner keys are significance levels
// (upper-tail probabilities); each value is the χ2 percentage point.
var chiSquaredDistributionTable = {
    1: { 0.995: 0, 0.99: 0, 0.975: 0, 0.95: 0, 0.9: 0.02, 0.5: 0.45, 0.1: 2.71, 0.05: 3.84, 0.025: 5.02, 0.01: 6.63, 0.005: 7.88 },
    2: { 0.995: 0.01, 0.99: 0.02, 0.975: 0.05, 0.95: 0.1, 0.9: 0.21, 0.5: 1.39, 0.1: 4.61, 0.05: 5.99, 0.025: 7.38, 0.01: 9.21, 0.005: 10.6 },
    3: { 0.995: 0.07, 0.99: 0.11, 0.975: 0.22, 0.95: 0.35, 0.9: 0.58, 0.5: 2.37, 0.1: 6.25, 0.05: 7.81, 0.025: 9.35, 0.01: 11.34, 0.005: 12.84 },
    4: { 0.995: 0.21, 0.99: 0.3, 0.975: 0.48, 0.95: 0.71, 0.9: 1.06, 0.5: 3.36, 0.1: 7.78, 0.05: 9.49, 0.025: 11.14, 0.01: 13.28, 0.005: 14.86 },
    5: { 0.995: 0.41, 0.99: 0.55, 0.975: 0.83, 0.95: 1.15, 0.9: 1.61, 0.5: 4.35, 0.1: 9.24, 0.05: 11.07, 0.025: 12.83, 0.01: 15.09, 0.005: 16.75 },
    6: { 0.995: 0.68, 0.99: 0.87, 0.975: 1.24, 0.95: 1.64, 0.9: 2.2, 0.5: 5.35, 0.1: 10.65, 0.05: 12.59, 0.025: 14.45, 0.01: 16.81, 0.005: 18.55 },
    7: { 0.995: 0.99, 0.99: 1.25, 0.975: 1.69, 0.95: 2.17, 0.9: 2.83, 0.5: 6.35, 0.1: 12.02, 0.05: 14.07, 0.025: 16.01, 0.01: 18.48, 0.005: 20.28 },
    8: { 0.995: 1.34, 0.99: 1.65, 0.975: 2.18, 0.95: 2.73, 0.9: 3.49, 0.5: 7.34, 0.1: 13.36, 0.05: 15.51, 0.025: 17.53, 0.01: 20.09, 0.005: 21.96 },
    9: { 0.995: 1.73, 0.99: 2.09, 0.975: 2.7, 0.95: 3.33, 0.9: 4.17, 0.5: 8.34, 0.1: 14.68, 0.05: 16.92, 0.025: 19.02, 0.01: 21.67, 0.005: 23.59 },
    10: { 0.995: 2.16, 0.99: 2.56, 0.975: 3.25, 0.95: 3.94, 0.9: 4.87, 0.5: 9.34, 0.1: 15.99, 0.05: 18.31, 0.025: 20.48, 0.01: 23.21, 0.005: 25.19 },
    11: { 0.995: 2.6, 0.99: 3.05, 0.975: 3.82, 0.95: 4.57, 0.9: 5.58, 0.5: 10.34, 0.1: 17.28, 0.05: 19.68, 0.025: 21.92, 0.01: 24.72, 0.005: 26.76 },
    12: { 0.995: 3.07, 0.99: 3.57, 0.975: 4.4, 0.95: 5.23, 0.9: 6.3, 0.5: 11.34, 0.1: 18.55, 0.05: 21.03, 0.025: 23.34, 0.01: 26.22, 0.005: 28.3 },
    13: { 0.995: 3.57, 0.99: 4.11, 0.975: 5.01, 0.95: 5.89, 0.9: 7.04, 0.5: 12.34, 0.1: 19.81, 0.05: 22.36, 0.025: 24.74, 0.01: 27.69, 0.005: 29.82 },
    14: { 0.995: 4.07, 0.99: 4.66, 0.975: 5.63, 0.95: 6.57, 0.9: 7.79, 0.5: 13.34, 0.1: 21.06, 0.05: 23.68, 0.025: 26.12, 0.01: 29.14, 0.005: 31.32 },
    15: { 0.995: 4.6, 0.99: 5.23, 0.975: 6.27, 0.95: 7.26, 0.9: 8.55, 0.5: 14.34, 0.1: 22.31, 0.05: 25, 0.025: 27.49, 0.01: 30.58, 0.005: 32.8 },
    16: { 0.995: 5.14, 0.99: 5.81, 0.975: 6.91, 0.95: 7.96, 0.9: 9.31, 0.5: 15.34, 0.1: 23.54, 0.05: 26.3, 0.025: 28.85, 0.01: 32, 0.005: 34.27 },
    17: { 0.995: 5.7, 0.99: 6.41, 0.975: 7.56, 0.95: 8.67, 0.9: 10.09, 0.5: 16.34, 0.1: 24.77, 0.05: 27.59, 0.025: 30.19, 0.01: 33.41, 0.005: 35.72 },
    18: { 0.995: 6.26, 0.99: 7.01, 0.975: 8.23, 0.95: 9.39, 0.9: 10.87, 0.5: 17.34, 0.1: 25.99, 0.05: 28.87, 0.025: 31.53, 0.01: 34.81, 0.005: 37.16 },
    19: { 0.995: 6.84, 0.99: 7.63, 0.975: 8.91, 0.95: 10.12, 0.9: 11.65, 0.5: 18.34, 0.1: 27.2, 0.05: 30.14, 0.025: 32.85, 0.01: 36.19, 0.005: 38.58 },
    20: { 0.995: 7.43, 0.99: 8.26, 0.975: 9.59, 0.95: 10.85, 0.9: 12.44, 0.5: 19.34, 0.1: 28.41, 0.05: 31.41, 0.025: 34.17, 0.01: 37.57, 0.005: 40 },
    21: { 0.995: 8.03, 0.99: 8.9, 0.975: 10.28, 0.95: 11.59, 0.9: 13.24, 0.5: 20.34, 0.1: 29.62, 0.05: 32.67, 0.025: 35.48, 0.01: 38.93, 0.005: 41.4 },
    22: { 0.995: 8.64, 0.99: 9.54, 0.975: 10.98, 0.95: 12.34, 0.9: 14.04, 0.5: 21.34, 0.1: 30.81, 0.05: 33.92, 0.025: 36.78, 0.01: 40.29, 0.005: 42.8 },
    23: { 0.995: 9.26, 0.99: 10.2, 0.975: 11.69, 0.95: 13.09, 0.9: 14.85, 0.5: 22.34, 0.1: 32.01, 0.05: 35.17, 0.025: 38.08, 0.01: 41.64, 0.005: 44.18 },
    24: { 0.995: 9.89, 0.99: 10.86, 0.975: 12.4, 0.95: 13.85, 0.9: 15.66, 0.5: 23.34, 0.1: 33.2, 0.05: 36.42, 0.025: 39.36, 0.01: 42.98, 0.005: 45.56 },
    25: { 0.995: 10.52, 0.99: 11.52, 0.975: 13.12, 0.95: 14.61, 0.9: 16.47, 0.5: 24.34, 0.1: 34.28, 0.05: 37.65, 0.025: 40.65, 0.01: 44.31, 0.005: 46.93 },
    26: { 0.995: 11.16, 0.99: 12.2, 0.975: 13.84, 0.95: 15.38, 0.9: 17.29, 0.5: 25.34, 0.1: 35.56, 0.05: 38.89, 0.025: 41.92, 0.01: 45.64, 0.005: 48.29 },
    27: { 0.995: 11.81, 0.99: 12.88, 0.975: 14.57, 0.95: 16.15, 0.9: 18.11, 0.5: 26.34, 0.1: 36.74, 0.05: 40.11, 0.025: 43.19, 0.01: 46.96, 0.005: 49.65 },
    28: { 0.995: 12.46, 0.99: 13.57, 0.975: 15.31, 0.95: 16.93, 0.9: 18.94, 0.5: 27.34, 0.1: 37.92, 0.05: 41.34, 0.025: 44.46, 0.01: 48.28, 0.005: 50.99 },
    29: { 0.995: 13.12, 0.99: 14.26, 0.975: 16.05, 0.95: 17.71, 0.9: 19.77, 0.5: 28.34, 0.1: 39.09, 0.05: 42.56, 0.025: 45.72, 0.01: 49.59, 0.005: 52.34 },
    30: { 0.995: 13.79, 0.99: 14.95, 0.975: 16.79, 0.95: 18.49, 0.9: 20.6, 0.5: 29.34, 0.1: 40.26, 0.05: 43.77, 0.025: 46.98, 0.01: 50.89, 0.005: 53.67 },
    40: { 0.995: 20.71, 0.99: 22.16, 0.975: 24.43, 0.95: 26.51, 0.9: 29.05, 0.5: 39.34, 0.1: 51.81, 0.05: 55.76, 0.025: 59.34, 0.01: 63.69, 0.005: 66.77 },
    50: { 0.995: 27.99, 0.99: 29.71, 0.975: 32.36, 0.95: 34.76, 0.9: 37.69, 0.5: 49.33, 0.1: 63.17, 0.05: 67.5, 0.025: 71.42, 0.01: 76.15, 0.005: 79.49 },
    60: { 0.995: 35.53, 0.99: 37.48, 0.975: 40.48, 0.95: 43.19, 0.9: 46.46, 0.5: 59.33, 0.1: 74.4, 0.05: 79.08, 0.025: 83.3, 0.01: 88.38, 0.005: 91.95 },
    70: { 0.995: 43.28, 0.99: 45.44, 0.975: 48.76, 0.95: 51.74, 0.9: 55.33, 0.5: 69.33, 0.1: 85.53, 0.05: 90.53, 0.025: 95.02, 0.01: 100.42, 0.005: 104.22 },
    80: { 0.995: 51.17, 0.99: 53.54, 0.975: 57.15, 0.95: 60.39, 0.9: 64.28, 0.5: 79.33, 0.1: 96.58, 0.05: 101.88, 0.025: 106.63, 0.01: 112.33, 0.005: 116.32 },
    90: { 0.995: 59.2, 0.99: 61.75, 0.975: 65.65, 0.95: 69.13, 0.9: 73.29, 0.5: 89.33, 0.1: 107.57, 0.05: 113.14, 0.025: 118.14, 0.01: 124.12, 0.005: 128.3 },
    100: { 0.995: 67.33, 0.99: 70.06, 0.975: 74.22, 0.95: 77.93, 0.9: 82.36, 0.5: 99.33, 0.1: 118.5, 0.05: 124.34, 0.025: 129.56, 0.01: 135.81, 0.005: 140.17 }
};
|
3251 |
|
/**
 * The [χ2 (Chi-Squared) Goodness-of-Fit Test](http://en.wikipedia.org/wiki/Goodness_of_fit#Pearson.27s_chi-squared_test)
 * uses a measure of goodness of fit which is the sum of differences between observed and expected outcome frequencies
 * (that is, counts of observations), each squared and divided by the number of observations expected given the
 * hypothesized distribution. The resulting χ2 statistic, `chiSquared`, can be compared to the chi-squared distribution
 * to determine the goodness of fit. In order to determine the degrees of freedom of the chi-squared distribution, one
 * takes the total number of observed frequencies and subtracts the number of estimated parameters. The test statistic
 * follows, approximately, a chi-square distribution with (k − c) degrees of freedom where `k` is the number of non-empty
 * cells and `c` is the number of estimated parameters for the distribution.
 *
 * @param {Array<number>} data
 * @param {Function} distributionType a function that returns a point in a distribution:
 * for instance, binomial, bernoulli, or poisson
 * @param {number} significance
 * @returns {boolean} `true` when the χ2 statistic exceeds the tabulated critical
 * value — i.e. the hypothesized distribution can be rejected at the given
 * significance level; `false` when the fit cannot be rejected
 * @example
 * // Data from Poisson goodness-of-fit example 10-19 in William W. Hines & Douglas C. Montgomery,
 * // "Probability and Statistics in Engineering and Management Science", Wiley (1980).
 * var data1019 = [
 *     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 *     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 *     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 *     2, 2, 2, 2, 2, 2, 2, 2, 2,
 *     3, 3, 3, 3
 * ];
 * ss.chiSquaredGoodnessOfFit(data1019, ss.poissonDistribution, 0.05); // => false
 */
function chiSquaredGoodnessOfFit(data, distributionType, significance) {
    // Estimate from the sample data, a weighted mean.
    // NOTE(review): `mean` is defined elsewhere in this file.
    var inputMean = mean(data);
    // Calculated value of the χ2 statistic.
    var chiSquared = 0;
    // Number of hypothesized distribution parameters estimated, expected to be supplied in the distribution test.
    // Lose one degree of freedom for estimating `lambda` from the sample data.
    var c = 1;
    // The hypothesized distribution.
    // Generate the hypothesized distribution.
    var hypothesizedDistribution = distributionType(inputMean);
    var observedFrequencies = [];
    var expectedFrequencies = [];

    // Create an array holding a histogram from the sample data, of
    // the form `{ value: numberOfOcurrences }`
    for (var i = 0; i < data.length; i++) {
        if (observedFrequencies[data[i]] === undefined) {
            observedFrequencies[data[i]] = 0;
        }
        observedFrequencies[data[i]]++;
    }

    // The histogram we created might be sparse - there might be gaps
    // between values. So we iterate through the histogram, making
    // sure that instead of undefined, gaps have 0 values.
    for (var i$1 = 0; i$1 < observedFrequencies.length; i$1++) {
        if (observedFrequencies[i$1] === undefined) {
            observedFrequencies[i$1] = 0;
        }
    }

    // Create an array holding a histogram of expected data given the
    // sample size and hypothesized distribution.
    for (var k in hypothesizedDistribution) {
        if (k in observedFrequencies) {
            expectedFrequencies[+k] = hypothesizedDistribution[k] * data.length;
        }
    }

    // Working backward through the expected frequencies, collapse classes
    // if less than three observations are expected for a class.
    // This transformation is applied to the observed frequencies as well.
    // NOTE(review): if the class at index 0 is collapsed, this writes to
    // index -1; confirm inputs always leave index 0 with >= 3 expected.
    for (var k$1 = expectedFrequencies.length - 1; k$1 >= 0; k$1--) {
        if (expectedFrequencies[k$1] < 3) {
            expectedFrequencies[k$1 - 1] += expectedFrequencies[k$1];
            expectedFrequencies.pop();

            observedFrequencies[k$1 - 1] += observedFrequencies[k$1];
            observedFrequencies.pop();
        }
    }

    // Iterate through the squared differences between observed & expected
    // frequencies, accumulating the `chiSquared` statistic.
    for (var k$2 = 0; k$2 < observedFrequencies.length; k$2++) {
        chiSquared +=
            Math.pow(observedFrequencies[k$2] - expectedFrequencies[k$2], 2) /
            expectedFrequencies[k$2];
    }

    // Calculate degrees of freedom for this test and look it up in the
    // `chiSquaredDistributionTable` in order to
    // accept or reject the goodness-of-fit of the hypothesized distribution.
    // Degrees of freedom, calculated as (number of class intervals -
    // number of hypothesized distribution parameters estimated - 1)
    // NOTE(review): the table only covers selected degrees of freedom
    // (1-30, 40, 50, ... 100); other values would throw here — confirm
    // callers stay within the tabulated range.
    var degreesOfFreedom = observedFrequencies.length - c - 1;
    return (
        chiSquaredDistributionTable[degreesOfFreedom][significance] < chiSquared
    );
}
|
3350 |
|
// √(2π) — normalizing constant of the gaussian kernel below.
var SQRT_2PI = Math.sqrt(2 * Math.PI);
|
3352 |
|
3353 | /**
|
3354 | * [Well-known kernels](https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use)
|
3355 | * @private
|
3356 | */
|
var kernels = {
    /**
     * The gaussian kernel: the probability density function of the
     * standard normal distribution, exp(−u²/2) / √(2π).
     * @private
     */
    gaussian: function (u) {
        return Math.exp(-0.5 * u * u) / SQRT_2PI;
    }
};
|
3366 |
|
3367 | /**
|
3368 | * Well known bandwidth selection methods
|
3369 | * @private
|
3370 | */
|
var bandwidthMethods = {
    /**
     * The ["normal reference distribution"
     * rule-of-thumb](https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/bandwidth.nrd.html),
     * a commonly used version of [Silverman's
     * rule-of-thumb](https://en.wikipedia.org/wiki/Kernel_density_estimation#A_rule-of-thumb_bandwidth_estimator).
     * @private
     */
    nrd: function (x) {
        // NOTE(review): `sampleStandardDeviation` and `interquartileRange`
        // are defined elsewhere in this file.
        var s = sampleStandardDeviation(x);
        var iqr = interquartileRange(x);
        // Use the smaller of the standard deviation and the scaled IQR
        // as a robust spread estimate (the IQR of a normal is ≈ 1.34σ).
        if (typeof iqr === "number") {
            s = Math.min(s, iqr / 1.34);
        }
        // Bandwidth = 1.06 · spread · n^(−1/5)
        return 1.06 * s * Math.pow(x.length, -0.2);
    }
};
|
3388 |
|
/**
 * [Kernel density estimation](https://en.wikipedia.org/wiki/Kernel_density_estimation)
 * is a useful tool for, among other things, estimating the shape of the
 * underlying probability distribution from a sample.
 *
 * @name kernelDensityEstimation
 * @param X sample values
 * @param kernel The kernel function to use. If a function is provided, it should return non-negative values and integrate to 1. Defaults to 'gaussian'.
 * @param bandwidthMethod The "bandwidth selection" method to use, or a fixed bandwidth value. Defaults to "nrd", the commonly-used ["normal reference distribution" rule-of-thumb](https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/bandwidth.nrd.html).
 * @returns {Function} An estimated [probability density function](https://en.wikipedia.org/wiki/Probability_density_function) for the given sample. The returned function runs in `O(X.length)`.
 */
function kernelDensityEstimation(X, kernel, bandwidthMethod) {
    // Resolve the kernel: strings are looked up by name, `undefined`
    // falls back to gaussian, and anything else is used as-is.
    var kernelFn;
    if (typeof kernel === "string") {
        if (!kernels[kernel]) {
            throw new Error('Unknown kernel "' + kernel + '"');
        }
        kernelFn = kernels[kernel];
    } else if (kernel === undefined) {
        kernelFn = kernels.gaussian;
    } else {
        kernelFn = kernel;
    }

    // Resolve the bandwidth the same way: named methods are applied to
    // the sample, `undefined` defaults to "nrd", and a numeric value is
    // taken as a fixed bandwidth.
    var bandwidth;
    if (typeof bandwidthMethod === "string") {
        if (!bandwidthMethods[bandwidthMethod]) {
            throw new Error(
                'Unknown bandwidth method "' + bandwidthMethod + '"'
            );
        }
        bandwidth = bandwidthMethods[bandwidthMethod](X);
    } else if (typeof bandwidthMethod === "undefined") {
        bandwidth = bandwidthMethods.nrd(X);
    } else {
        bandwidth = bandwidthMethod;
    }

    // The estimated density: the average of the kernel evaluated at the
    // scaled distance from `x` to each sample point.
    return function (x) {
        var total = 0;
        for (var j = 0; j < X.length; j++) {
            total += kernelFn((x - X[j]) / bandwidth);
        }
        return total / bandwidth / X.length;
    };
}
|
3436 |
|
/**
 * The [Z-Score, or Standard Score](http://en.wikipedia.org/wiki/Standard_score).
 *
 * The standard score expresses how many standard deviations an
 * observation or datum sits above (positive) or below (negative)
 * the mean. It is a dimensionless quantity, obtained by subtracting
 * the population mean from an individual raw score and then dividing
 * the difference by the population standard deviation.
 *
 * The z-score is only defined if one knows the population parameters;
 * if one only has a sample set, then the analogous computation with
 * sample mean and sample standard deviation yields the
 * Student's t-statistic.
 *
 * @param {number} x
 * @param {number} mean
 * @param {number} standardDeviation
 * @return {number} z score
 * @example
 * zScore(78, 80, 5); // => -0.4
 */
function zScore(x, mean, standardDeviation) {
    var deviationFromMean = x - mean;
    return deviationFromMean / standardDeviation;
}
|
3463 |
|
// √(2π) — used by `cumulativeDistribution` below. (Presumably a duplicate
// of SQRT_2PI renamed by the bundler — the `$1` suffix suggests so.)
var SQRT_2PI$1 = Math.sqrt(2 * Math.PI);
|
3465 |
|
/**
 * Cumulative distribution function Φ(z) of the standard normal
 * distribution, evaluated with the series
 * Φ(z) = 0.5 + φ(z)·(z + z³/3 + z⁵/(3·5) + …) and rounded to four
 * decimal places to match printed z-tables.
 * @param {number} z
 * @returns {number} P(Z ≤ z), rounded to 4 decimal places
 * @private
 */
function cumulativeDistribution(z) {
    var sum = z,
        tmp = z;

    // 15 iterations are enough for 4-digit precision
    for (var i = 1; i < 15; i++) {
        // Each term multiplies the previous by z² / (2i + 1).
        tmp *= (z * z) / (2 * i + 1);
        sum += tmp;
    }
    // Scale by the normal density φ(z) = e^(−z²/2) / √(2π), shift by 0.5,
    // and round to 4 decimal places.
    return (
        Math.round((0.5 + (sum / SQRT_2PI$1) * Math.exp((-z * z) / 2)) * 1e4) /
        1e4
    );
}
|
3480 |
|
3481 | /**
|
3482 | * A standard normal table, also called the unit normal table or Z table,
|
3483 | * is a mathematical table for the values of Φ (phi), which are the values of
|
3484 | * the cumulative distribution function of the normal distribution.
|
3485 | * It is used to find the probability that a statistic is observed below,
|
3486 | * above, or between values on the standard normal distribution, and by
|
3487 | * extension, any normal distribution.
|
3488 | *
|
3489 | * The probabilities are calculated using the
|
3490 | * [Cumulative distribution function](https://en.wikipedia.org/wiki/Normal_distribution#Cumulative_distribution_function).
|
3491 | * The table used is the cumulative, and not cumulative from 0 to mean
|
3492 | * (even though the latter has 5 digits precision, instead of 4).
|
3493 | */
|
var standardNormalTable = [];

// Precompute Φ(z) for z = 0.00, 0.01, … up to 3.09, so index i holds the
// value for z ≈ i / 100. (The loop steps by a floating-point 0.01, so the
// exact stopping point is approximate but covers the conventional table.)
for (var z = 0; z <= 3.09; z += 0.01) {
    standardNormalTable.push(cumulativeDistribution(z));
}
|
3499 |
|
3500 | /**
|
3501 | * **[Cumulative Standard Normal Probability](http://en.wikipedia.org/wiki/Standard_normal_table)**
|
3502 | *
|
3503 | * Since probability tables cannot be
|
3504 | * printed for every normal distribution, as there are an infinite variety
|
3505 | * of normal distributions, it is common practice to convert a normal to a
|
3506 | * standard normal and then use the standard normal table to find probabilities.
|
3507 | *
|
3508 | * You can use `.5 + .5 * errorFunction(x / Math.sqrt(2))` to calculate the probability
|
3509 | * instead of looking it up in a table.
|
3510 | *
|
3511 | * @param {number} z
|
3512 | * @returns {number} cumulative standard normal probability
|
3513 | */
|
function cumulativeStdNormalProbability(z) {
    // The table only covers z >= 0, so look up |z| and rely on the
    // symmetry Φ(−z) = 1 − Φ(z) for negative inputs.
    var absZ = Math.abs(z);
    // Each value in the table corresponds to a range of 0.01 in the
    // input values, so the value is multiplied by 100; clamp to the
    // last table entry for very large |z|.
    var index = Math.min(
        Math.round(absZ * 100),
        standardNormalTable.length - 1
    );

    // The index we calculate must be in the table as a positive value,
    // but we still pay attention to whether the input is positive
    // or negative, and flip the output value as a last step.
    if (z >= 0) {
        return standardNormalTable[index];
    } else {
        // due to floating-point arithmetic, values in the table with
        // 4 significant figures can nevertheless end up as repeating
        // fractions when they're computed here, so re-round to 4 digits.
        return +(1 - standardNormalTable[index]).toFixed(4);
    }
}
|
3538 |
|
3539 | /**
|
3540 | * **[Gaussian error function](http://en.wikipedia.org/wiki/Error_function)**
|
3541 | *
|
3542 | * The `errorFunction(x/(sd * Math.sqrt(2)))` is the probability that a value in a
|
3543 | * normal distribution with standard deviation sd is within x of the mean.
|
3544 | *
|
3545 | * This function returns a numerical approximation to the exact value.
|
3546 | * It uses Horner's method to evaluate the polynomial of τ (tau).
|
3547 | *
|
3548 | * @param {number} x input
|
3549 | * @return {number} error estimation
|
3550 | * @example
|
3551 | * errorFunction(1).toFixed(2); // => '0.84'
|
3552 | */
|
function errorFunction(x) {
    // Rational approximation of erf evaluated with Horner's method; the
    // coefficients are listed highest-order first so each loop step performs
    // exactly the multiply-add the nested original expression did.
    var t = 1 / (1 + 0.5 * Math.abs(x));
    var coefficients = [
        0.17087277, -0.82215223, 1.48851587, -1.13520398, 0.27886807,
        -0.18628806, 0.09678418, 0.37409196, 1.00002368, -1.26551223
    ];
    var polynomial = coefficients[0];
    for (var i = 1; i < coefficients.length; i++) {
        polynomial = polynomial * t + coefficients[i];
    }
    var tau = t * Math.exp(-x * x + polynomial);
    // erf is odd: mirror the result for negative inputs.
    return x >= 0 ? 1 - tau : tau - 1;
}
|
3580 |
|
3581 | /**
|
3582 | * The Inverse [Gaussian error function](http://en.wikipedia.org/wiki/Error_function)
|
3583 | * returns a numerical approximation to the value that would have caused
|
3584 | * `errorFunction()` to return x.
|
3585 | *
|
3586 | * @param {number} x value of error function
|
3587 | * @returns {number} estimated inverted value
|
3588 | */
|
/**
 * Numerical approximation to the inverse of `errorFunction()` (Winitzki's
 * formula). Improvement over the previous version: `Math.log(1 - x * x)` was
 * evaluated three times and `2 / (Math.PI * a)` twice per call; both are now
 * hoisted into locals, which is both faster and easier to read.
 *
 * @param {number} x value of error function, expected in (-1, 1)
 * @returns {number} estimated inverted value
 */
function inverseErrorFunction(x) {
    // Shape constant of Winitzki's approximation.
    var a = (8 * (Math.PI - 3)) / (3 * Math.PI * (4 - Math.PI));

    var logTerm = Math.log(1 - x * x);
    var base = 2 / (Math.PI * a) + logTerm / 2;

    var inv = Math.sqrt(Math.sqrt(base * base - logTerm / a) - base);

    // The inverse error function is odd; restore the sign of the input.
    return x >= 0 ? inv : -inv;
}
|
3606 |
|
3607 | /**
|
3608 | * The [Probit](http://en.wikipedia.org/wiki/Probit)
|
3609 | * is the inverse of cumulativeStdNormalProbability(),
|
3610 | * and is also known as the normal quantile function.
|
3611 | *
|
3612 | * It returns the number of standard deviations from the mean
|
3613 | * where the p'th quantile of values can be found in a normal distribution.
|
3614 | * So, for example, probit(0.5 + 0.6827/2) ≈ 1 because 68.27% of values are
|
3615 | * normally found within 1 standard deviation above or below the mean.
|
3616 | *
|
3617 | * @param {number} p
|
3618 | * @returns {number} probit
|
3619 | */
|
function probit(p) {
    // Clamp the probability away from the extremes: the inverse error
    // function diverges at ±1, so 0 and 1 are nudged inward by epsilon.
    var clamped = p;
    if (clamped === 0) {
        clamped = epsilon;
    } else if (clamped >= 1) {
        clamped = 1 - epsilon;
    }
    // Standard normal quantile in terms of the inverse error function.
    return Math.sqrt(2) * inverseErrorFunction(2 * clamped - 1);
}
|
3628 |
|
3629 | /**
|
3630 | * Conducts a [permutation test](https://en.wikipedia.org/wiki/Resampling_(statistics)#Permutation_tests)
|
3631 | * to determine if two data sets are *significantly* different from each other, using
|
3632 | * the difference of means between the groups as the test statistic.
|
3633 | * The function allows for the following hypotheses:
|
3634 | * - two_tail = Null hypothesis: the two distributions are equal.
|
3635 | * - greater = Null hypothesis: observations from sampleX tend to be smaller than those from sampleY.
|
3636 | * - less = Null hypothesis: observations from sampleX tend to be greater than those from sampleY.
|
3637 | * [Learn more about one-tail vs two-tail tests.](https://en.wikipedia.org/wiki/One-_and_two-tailed_tests)
|
3638 | *
|
3639 | * @param {Array<number>} sampleX first dataset (e.g. treatment data)
|
3640 | * @param {Array<number>} sampleY second dataset (e.g. control data)
|
3641 | * @param {string} alternative alternative hypothesis, either 'two_side' (default), 'greater', or 'less'
|
3642 | * @param {number} k number of values in permutation distribution.
|
3643 | * @param {Function} [randomSource=Math.random] an optional entropy source
|
3644 | * @returns {number} p-value The probability of observing the difference between groups (as or more extreme than what we did), assuming the null hypothesis.
|
3645 | *
|
3646 | * @example
|
3647 | * var control = [2, 5, 3, 6, 7, 2, 5];
|
3648 | * var treatment = [20, 5, 13, 12, 7, 2, 2];
|
3649 | * permutationTest(control, treatment); // ~0.1324
|
3650 | */
|
function permutationTest(sampleX, sampleY, alternative, k, randomSource) {
    // Set default arguments
    if (k === undefined) {
        k = 10000;
    }
    if (alternative === undefined) {
        alternative = "two_side";
    }
    if (
        alternative !== "two_side" &&
        alternative !== "greater" &&
        alternative !== "less"
    ) {
        throw new Error(
            "`alternative` must be either 'two_side', 'greater', or 'less'"
        );
    }

    // get means for each sample
    var meanX = mean(sampleX);
    var meanY = mean(sampleY);

    // calculate initial test statistic. This will be our point of comparison with
    // the generated test statistics.
    var testStatistic = meanX - meanY;

    // create test-statistic distribution
    var testStatDsn = new Array(k);

    // combine datasets so we can easily shuffle later
    var allData = sampleX.concat(sampleY);
    var midIndex = Math.floor(allData.length / 2);

    for (var i = 0; i < k; i++) {
        // 1. shuffle data assignments
        shuffleInPlace(allData, randomSource);
        var permLeft = allData.slice(0, midIndex);
        var permRight = allData.slice(midIndex, allData.length);

        // 2. re-calculate test statistic
        var permTestStatistic = mean(permLeft) - mean(permRight);

        // 3. store test statistic to build test statistic distribution
        testStatDsn[i] = permTestStatistic;
    }

    // Calculate p-value depending on alternative
    // For this test, we calculate the percentage of 'extreme' test statistics (subject to our hypothesis)
    // more info on permutation test p-value calculations: https://onlinecourses.science.psu.edu/stat464/node/35
    //
    // NOTE: these loops previously ran `i <= k`, reading the out-of-bounds
    // entry testStatDsn[k] (undefined). The comparison against undefined was
    // always false, so results were unchanged, but the off-by-one is fixed
    // here with `i < k`.
    var numExtremeTStats = 0;
    if (alternative === "two_side") {
        for (var i$1 = 0; i$1 < k; i$1++) {
            if (Math.abs(testStatDsn[i$1]) >= Math.abs(testStatistic)) {
                numExtremeTStats += 1;
            }
        }
    } else if (alternative === "greater") {
        for (var i$2 = 0; i$2 < k; i$2++) {
            if (testStatDsn[i$2] >= testStatistic) {
                numExtremeTStats += 1;
            }
        }
    } else {
        // alternative === 'less'
        for (var i$3 = 0; i$3 < k; i$3++) {
            if (testStatDsn[i$3] <= testStatistic) {
                numExtremeTStats += 1;
            }
        }
    }

    return numExtremeTStats / k;
}
|
3724 |
|
3725 | /**
|
3726 | * [Sign](https://en.wikipedia.org/wiki/Sign_function) is a function
|
3727 | * that extracts the sign of a real number
|
3728 | *
|
3729 | * @param {number} x input value
|
3730 | * @returns {number} sign value either 1, 0 or -1
|
3731 | * @throws {TypeError} if the input argument x is not a number
|
3732 | * @private
|
3733 | *
|
3734 | * @example
|
3735 | * sign(2); // => 1
|
3736 | */
|
function sign(x) {
    // Guard clause: reject non-numeric input up front.
    if (typeof x !== "number") {
        throw new TypeError("not a number");
    }
    if (x < 0) {
        return -1;
    }
    if (x === 0) {
        return 0;
    }
    // Positive numbers — and NaN, which fails both tests above — fall through
    // to 1, matching the original branch ordering.
    return 1;
}
|
3750 |
|
3751 | /**
|
3752 | * [Bisection method](https://en.wikipedia.org/wiki/Bisection_method) is a root-finding
|
3753 | * method that repeatedly bisects an interval to find the root.
|
3754 | *
|
3755 | * This function returns a numerical approximation to the exact value.
|
3756 | *
|
3757 | * @param {Function} func input function
|
3758 | * @param {number} start - start of interval
|
3759 | * @param {number} end - end of interval
|
3760 | * @param {number} maxIterations - the maximum number of iterations
|
3761 | * @param {number} errorTolerance - the error tolerance
|
3762 | * @returns {number} estimated root value
|
3763 | * @throws {TypeError} Argument func must be a function
|
3764 | *
|
3765 | * @example
|
3766 | * bisect(Math.cos,0,4,100,0.003); // => 1.572265625
|
3767 | */
|
/**
 * Bisection root finder. Improvement over the previous version: `func` was
 * invoked up to three times per iteration on the same midpoint (once in the
 * stopping test and twice more via `sign(func(output))`); the midpoint value
 * is now computed once per iteration.
 *
 * @param {Function} func input function
 * @param {number} start - start of interval
 * @param {number} end - end of interval
 * @param {number} maxIterations - the maximum number of iterations
 * @param {number} errorTolerance - the error tolerance
 * @returns {number} estimated root value
 * @throws {TypeError} Argument func must be a function
 * @throws {Error} if maxIterations is exceeded before convergence
 *
 * @example
 * bisect(Math.cos,0,4,100,0.003); // => 1.572265625
 */
function bisect(func, start, end, maxIterations, errorTolerance) {
    if (typeof func !== "function") {
        throw new TypeError("func must be a function");
    }

    for (var i = 0; i < maxIterations; i++) {
        var output = (start + end) / 2;
        // Evaluate the midpoint once and reuse it below.
        var midValue = func(output);

        if (
            midValue === 0 ||
            Math.abs((end - start) / 2) < errorTolerance
        ) {
            return output;
        }

        // Keep the half-interval whose endpoints bracket a sign change.
        if (sign(midValue) === sign(func(start))) {
            start = output;
        } else {
            end = output;
        }
    }

    throw new Error("maximum number of iterations exceeded");
}
|
3791 |
|
3792 | /**
|
3793 | * Calculate Euclidean distance between two points.
|
3794 | * @param {Array<number>} left First N-dimensional point.
|
3795 | * @param {Array<number>} right Second N-dimensional point.
|
3796 | * @returns {number} Distance.
|
3797 | */
|
function euclideanDistance(left, right) {
    // Accumulate the squared component differences, then take the root.
    var squaredSum = left.reduce(function (acc, value, i) {
        var delta = value - right[i];
        return acc + delta * delta;
    }, 0);
    return Math.sqrt(squaredSum);
}
|
3806 |
|
3807 | /**
|
3808 | * @typedef {Object} kMeansReturn
|
3809 | * @property {Array<number>} labels The labels.
|
3810 | * @property {Array<Array<number>>} centroids The cluster centroids.
|
3811 | */
|
3812 |
|
3813 | /**
|
3814 | * Perform k-means clustering.
|
3815 | *
|
3816 | * @param {Array<Array<number>>} points N-dimensional coordinates of points to be clustered.
|
3817 | * @param {number} numCluster How many clusters to create.
|
3818 | * @param {Function} randomSource An optional entropy source that generates uniform values in [0, 1).
|
3819 | * @return {kMeansReturn} Labels (same length as data) and centroids (same length as numCluster).
|
3820 | * @throws {Error} If any centroids wind up friendless (i.e., without associated points).
|
3821 | *
|
3822 | * @example
|
3823 | * kMeansCluster([[0.0, 0.5], [1.0, 0.5]], 2); // => {labels: [0, 1], centroids: [[0.0, 0.5], [1.0, 0.5]]}
|
3824 | */
|
function kMeansCluster(points, numCluster, randomSource) {
    if (randomSource === undefined) { randomSource = Math.random; }

    // Seed the centroids with a random sample of the points, then iterate
    // label/recompute until the centroids stop moving.
    var centroids = sample(points, numCluster, randomSource);
    var labels = null;
    var previousCentroids;
    do {
        labels = labelPoints(points, centroids);
        previousCentroids = centroids;
        centroids = calculateCentroids(points, labels, numCluster);
    } while (calculateChange(centroids, previousCentroids) !== 0);

    return {
        labels: labels,
        centroids: centroids
    };
}
|
3843 |
|
3844 | /**
|
3845 | * Label each point according to which centroid it is closest to.
|
3846 | *
|
3847 | * @private
|
3848 | * @param {Array<Array<number>>} points Array of XY coordinates.
|
3849 | * @param {Array<Array<number>>} centroids Current centroids.
|
3850 | * @return {Array<number>} Group labels.
|
3851 | */
|
function labelPoints(points, centroids) {
    // For each point, pick the index of the closest centroid (ties keep the
    // earlier centroid, since only a strictly smaller distance wins).
    return points.map(function (point) {
        var bestLabel = -1;
        var bestDistance = Number.MAX_VALUE;
        centroids.forEach(function (centroid, index) {
            var distance = euclideanDistance(point, centroid);
            if (distance < bestDistance) {
                bestDistance = distance;
                bestLabel = index;
            }
        });
        return bestLabel;
    });
}
|
3866 |
|
3867 | /**
|
3868 | * Calculate centroids for points given labels.
|
3869 | *
|
3870 | * @private
|
3871 | * @param {Array<Array<number>>} points Array of XY coordinates.
|
3872 | * @param {Array<number>} labels Which groups points belong to.
|
3873 | * @param {number} numCluster Number of clusters being created.
|
3874 | * @return {Array<Array<number>>} Centroid for each group.
|
3875 | * @throws {Error} If any centroids wind up friendless (i.e., without associated points).
|
3876 | */
|
function calculateCentroids(points, labels, numCluster) {
    // Per-cluster coordinate sums and point counts.
    var dimension = points[0].length;
    var sums = makeMatrix(numCluster, dimension);
    var counts = Array(numCluster).fill(0);

    // Accumulate each point into its labelled cluster.
    points.forEach(function (point, index) {
        var label = labels[index];
        var accumulator = sums[label];
        for (var j = 0; j < dimension; j++) {
            accumulator[j] += point[j];
        }
        counts[label] += 1;
    });

    // Divide sums by counts to get means, rejecting empty clusters.
    for (var c = 0; c < numCluster; c++) {
        if (counts[c] === 0) {
            throw new Error(("Centroid " + c + " has no friends"));
        }
        var centroid = sums[c];
        for (var j = 0; j < dimension; j++) {
            centroid[j] /= counts[c];
        }
    }

    return sums;
}
|
3908 |
|
3909 | /**
|
3910 | * Calculate the difference between old centroids and new centroids.
|
3911 | *
|
3912 | * @private
|
3913 | * @param {Array<Array<number>>} left One list of centroids.
|
3914 | * @param {Array<Array<number>>} right Another list of centroids.
|
3915 | * @return {number} Distance between centroids.
|
3916 | */
|
function calculateChange(left, right) {
    // Sum of pairwise centroid distances; zero means the clustering converged.
    return left.reduce(function (total, centroid, i) {
        return total + euclideanDistance(centroid, right[i]);
    }, 0);
}
|
3924 |
|
3925 | /**
|
3926 | * Calculate the [silhouette values](https://en.wikipedia.org/wiki/Silhouette_(clustering))
|
3927 | * for clustered data.
|
3928 | *
|
3929 | * @param {Array<Array<number>>} points N-dimensional coordinates of points.
|
3930 | * @param {Array<number>} labels Labels of points. This must be the same length as `points`,
|
3931 | * and values must lie in [0..G-1], where G is the number of groups.
|
3932 | * @return {Array<number>} The silhouette value for each point.
|
3933 | *
|
3934 | * @example
|
3935 | * silhouette([[0.25], [0.75]], [0, 0]); // => [1.0, 1.0]
|
3936 | */
|
function silhouette(points, labels) {
    if (points.length !== labels.length) {
        throw new Error("must have exactly as many labels as points");
    }
    var groupings = createGroups(labels);
    var distances = calculateAllDistances(points);
    // One silhouette value per point; singleton groups score 0 by convention.
    return points.map(function (_, i) {
        if (groupings[labels[i]].length <= 1) {
            return 0;
        }
        // a: mean distance to the point's own group;
        // b: mean distance to the nearest other group.
        var within = meanDistanceFromPointToGroup(
            i,
            groupings[labels[i]],
            distances
        );
        var nearest = meanDistanceToNearestGroup(
            i,
            labels,
            groupings,
            distances
        );
        return (nearest - within) / Math.max(within, nearest);
    });
}
|
3964 |
|
3965 | /**
|
3966 | * Create a lookup table mapping group IDs to point IDs.
|
3967 | *
|
3968 | * @private
|
3969 | * @param {Array<number>} labels Labels of points. This must be the same length as `points`,
|
3970 | * and values must lie in [0..G-1], where G is the number of groups.
|
3971 | * @return {Array<Array<number>>} An array of length G, each of whose entries is an array
|
3972 | * containing the indices of the points in that group.
|
3973 | */
|
function createGroups(labels) {
    // One bucket per group; labels are assumed to lie in [0..max(labels)].
    var result = Array(1 + max(labels));
    labels.forEach(function (label, pointIndex) {
        if (result[label] === undefined) {
            result[label] = [];
        }
        result[label].push(pointIndex);
    });
    return result;
}
|
3986 |
|
3987 | /**
|
3988 | * Create a lookup table of all inter-point distances.
|
3989 | *
|
3990 | * @private
|
3991 | * @param {Array<Array<number>>} points N-dimensional coordinates of points.
|
3992 | * @return {Array<Array<number>>} A symmetric square array of inter-point distances
|
3993 | * (zero on the diagonal).
|
3994 | */
|
function calculateAllDistances(points) {
    var numPoints = points.length;
    var result = makeMatrix(numPoints, numPoints);
    // Fill the strict upper triangle and mirror it; the diagonal stays at the
    // zero the matrix was initialized with.
    for (var row = 0; row < numPoints; row++) {
        for (var col = row + 1; col < numPoints; col++) {
            var distance = euclideanDistance(points[row], points[col]);
            result[row][col] = distance;
            result[col][row] = distance;
        }
    }
    return result;
}
|
4006 |
|
4007 | /**
|
4008 | * Calculate the mean distance between this point and all the points in the
|
4009 | * nearest group (as determined by which point in another group is closest).
|
4010 | *
|
4011 | * @private
|
4012 | * @param {number} which The index of this point.
|
4013 | * @param {Array<number>} labels Labels of points.
|
4014 | * @param {Array<Array<number>>} groupings An array whose entries are arrays
|
4015 | * containing the indices of the points in that group.
|
4016 | * @param {Array<Array<number>>} distances A symmetric square array of inter-point
|
4017 | * distances.
|
4018 | * @return {number} The mean distance from this point to others in the nearest
|
4019 | * group.
|
4020 | */
|
function meanDistanceToNearestGroup(which, labels, groupings, distances) {
    var ownLabel = labels[which];
    var nearest = Number.MAX_VALUE;
    // Take the minimum mean distance over every group except the point's own.
    groupings.forEach(function (group, groupIndex) {
        if (groupIndex === ownLabel) {
            return;
        }
        var candidate = meanDistanceFromPointToGroup(which, group, distances);
        if (candidate < nearest) {
            nearest = candidate;
        }
    });
    return nearest;
}
|
4038 |
|
4039 | /**
|
4040 | * Calculate the mean distance between a point and all the points in a group
|
4041 | * (possibly its own).
|
4042 | *
|
4043 | * @private
|
4044 | * @param {number} which The index of this point.
|
4045 | * @param {Array<number>} group The indices of all the points in the group in
|
4046 | * question.
|
4047 | * @param {Array<Array<number>>} distances A symmetric square array of inter-point
|
4048 | * distances.
|
4049 | * @return {number} The mean distance from this point to others in the
|
4050 | * specified group.
|
4051 | */
|
function meanDistanceFromPointToGroup(which, group, distances) {
    // Average the precomputed distances from this point's row to every
    // member of the group (the group may include the point itself).
    var row = distances[which];
    var total = 0;
    for (var k = 0; k < group.length; k++) {
        total += row[group[k]];
    }
    return total / group.length;
}
|
4059 |
|
4060 | /**
|
4061 | * Calculate the [silhouette metric](https://en.wikipedia.org/wiki/Silhouette_(clustering))
|
4062 | * for a set of N-dimensional points arranged in groups. The metric is the largest
|
4063 | * individual silhouette value for the data.
|
4064 | *
|
4065 | * @param {Array<Array<number>>} points N-dimensional coordinates of points.
|
4066 | * @param {Array<number>} labels Labels of points. This must be the same length as `points`,
|
4067 | * and values must lie in [0..G-1], where G is the number of groups.
|
4068 | * @return {number} The silhouette metric for the groupings.
|
4069 | *
|
4070 | * @example
|
4071 | * silhouetteMetric([[0.25], [0.75]], [0, 0]); // => 1.0
|
4072 | */
|
function silhouetteMetric(points, labels) {
    // The metric is simply the largest per-point silhouette value.
    return max(silhouette(points, labels));
}
|
4077 |
|
4078 | /**
|
4079 | * Relative error.
|
4080 | *
|
4081 | * This is more difficult to calculate than it first appears [1,2]. The usual
|
4082 | * formula for the relative error between an actual value A and an expected
|
4083 | * value E is `|(A-E)/E|`, but:
|
4084 | *
|
4085 | * 1. If the expected value is 0, any other value has infinite relative error,
|
4086 | * which is counter-intuitive: if the expected voltage is 0, getting 1/10th
|
4087 | * of a volt doesn't feel like an infinitely large error.
|
4088 | *
|
4089 | * 2. This formula does not satisfy the mathematical definition of a metric [3].
|
4090 | * [4] solved this problem by defining the relative error as `|ln(|A/E|)|`,
|
4091 | * but that formula only works if all values are positive: for example, it
|
4092 | * reports the relative error of -10 and 10 as 0.
|
4093 | *
|
4094 | * Our implementation sticks with convention and returns:
|
4095 | *
|
4096 | * - 0 if the actual and expected values are both zero
|
4097 | * - Infinity if the actual value is non-zero and the expected value is zero
|
4098 | * - `|(A-E)/E|` in all other cases
|
4099 | *
|
4100 | * [1] https://math.stackexchange.com/questions/677852/how-to-calculate-relative-error-when-true-value-is-zero
|
4101 | * [2] https://en.wikipedia.org/wiki/Relative_change_and_difference
|
4102 | * [3] https://en.wikipedia.org/wiki/Metric_(mathematics)#Definition
|
4103 | * [4] F.W.J. Olver: "A New Approach to Error Arithmetic." SIAM Journal on
|
4104 | * Numerical Analysis, 15(2), 1978, 10.1137/0715024.
|
4105 | *
|
4106 | * @param {number} actual The actual value.
|
4107 | * @param {number} expected The expected value.
|
4108 | * @return {number} The relative error.
|
4109 | */
|
function relativeError(actual, expected) {
    // Both zero is the one case where |(A-E)/E| would be NaN; define it as 0.
    // A non-zero actual against an expected of 0 naturally yields Infinity.
    var bothZero = actual === 0 && expected === 0;
    return bothZero ? 0 : Math.abs((actual - expected) / expected);
}
|
4116 |
|
4117 | /**
|
4118 | * Approximate equality.
|
4119 | *
|
4120 | * @param {number} actual The value to be tested.
|
4121 | * @param {number} expected The reference value.
|
4122 | * @param {number} tolerance The acceptable relative difference.
|
4123 | * @return {boolean} Whether numbers are within tolerance.
|
4124 | */
|
function approxEqual(actual, expected, tolerance) {
    // Default tolerance is the library-wide epsilon.
    if (tolerance === undefined) { tolerance = epsilon; }

    var error = relativeError(actual, expected);
    return error <= tolerance;
}
|
4130 |
|
4131 | export { BayesianClassifier, PerceptronModel, addToMean, approxEqual, mean as average, meanSimple as averageSimple, BayesianClassifier as bayesian, bernoulliDistribution, binomialDistribution, bisect, chiSquaredDistributionTable, chiSquaredGoodnessOfFit, chunk, ckmeans, combinations, combinationsReplacement, combineMeans, combineVariances, cumulativeStdNormalProbability, epsilon, equalIntervalBreaks, errorFunction as erf, errorFunction, extent, extentSorted, factorial, gamma, gammaln, geometricMean, harmonicMean, interquartileRange, inverseErrorFunction, interquartileRange as iqr, kMeansCluster, kernelDensityEstimation as kde, kernelDensityEstimation, linearRegression, linearRegressionLine, medianAbsoluteDeviation as mad, max, maxSorted, mean, meanSimple, median, medianAbsoluteDeviation, medianSorted, min, minSorted, mode, modeFast, modeSorted, numericSort, PerceptronModel as perceptron, permutationTest, permutationsHeap, poissonDistribution, probit, product, quantile, quantileRank, quantileRankSorted, quantileSorted, quickselect, rSquared, relativeError, rootMeanSquare as rms, rootMeanSquare, sample, sampleCorrelation, sampleCovariance, sampleKurtosis, sampleSkewness, sampleStandardDeviation, sampleVariance, sampleWithReplacement, shuffle, shuffleInPlace, sign, silhouette, silhouetteMetric, standardDeviation, standardNormalTable, subtractFromMean, sum, sumNthPowerDeviations, sumSimple, tTest, tTestTwoSample, uniqueCountSorted, variance, zScore };
|
4132 | //# sourceMappingURL=simple-statistics.mjs.map
|