UNPKG

6.7 kBJavaScriptView Raw
1'use strict'
2
3const ttest = require('ttest')
4const atol = 1e-9
5
function guessInterval (data) {
  // This function separates the handle sequence into three partitions, in
  // order to guess the benchmarking interval. In a typical benchmark the
  // handle curve will look like:
  //
  // handles
  // |
  // |       |^^^^^|
  // | -----|      |-----
  // ------------------------ time
  //
  // The flat lines can have some variation, so partition generation is
  // treated as a piecewise least squares regression problem with unknown
  // partition boundaries, where each partition is fitted by a flat line
  // (its mean).
  // There is no closed-form solution to this problem, but since it is a 1D
  // problem and the curves are constrained to be flat, the solution can be
  // brute forced in O(n^2) time. Doing so requires an online mean/variance
  // algorithm, so the Welford algorithm is used together with a derived
  // inverse Welford algorithm (see OnlineMeanVariance below).
  const left = new OnlineMeanVariance()
  const middle = new OnlineMeanVariance()
  const right = new OnlineMeanVariance()

  // For performance, restructure the data into a dense array
  const handles = data.map((d) => d.handles)

  // Start by assigning all observations to the middle partition, i.e. the
  // middle covers the index interval [0, handles.length)
  for (const datum of handles) {
    middle.add(datum)
  }

  // Use the current partitioning, with everything assigned to the middle,
  // as the initial best solution.
  let bestTotalError = middle.sse
  let bestLeftIndex = 0 // corresponds to slice(0, leftIndex)
  let bestRightIndex = handles.length // corresponds to slice(rightIndex)

  // Brute force all valid combinations of leftIndex and rightIndex.
  // The middle partition must always contain at least one value, so don't
  // scan to the end but stop at `handles.length - 1`.
  for (let leftIndex = -1; leftIndex < handles.length - 1; leftIndex++) {
    // Move one observation from the middle to the left partition, making the
    // middle cover [leftIndex + 1, handles.length). leftIndex === -1 means
    // the left partition is empty.
    if (leftIndex >= 0) {
      left.add(handles[leftIndex])
      middle.remove(handles[leftIndex])
    }
    right.reset()

    // The middle statistics mutate while scanning from the right, so save
    // the middle state for the interval [leftIndex + 1, handles.length)
    // and restore it before the next leftIndex iteration.
    const oldMiddleState = middle.save()
    // Try all valid rightIndex values. The middle must keep at least one
    // value, so don't scan down to leftIndex but stop at `leftIndex + 1`.
    for (let rightIndex = handles.length; rightIndex > leftIndex + 1; rightIndex--) {
      // Move one observation from the middle to the right partition, making
      // the middle cover [leftIndex + 1, rightIndex).
      // rightIndex === handles.length means the right partition is empty.
      if (rightIndex < handles.length) {
        right.add(handles[rightIndex])
        middle.remove(handles[rightIndex])
      }

      // Check for improvement. The improvement must exceed the absolute
      // tolerance `atol`, and both flanks must be statistically below the
      // middle plateau for the split to count.
      if (middle.size >= 2 &&
          bestTotalError > left.sse + middle.sse + right.sse + atol &&
          statisticallyLessThan(left, middle) &&
          statisticallyLessThan(right, middle)) {
        bestTotalError = left.sse + middle.sse + right.sse
        bestLeftIndex = leftIndex + 1
        bestRightIndex = rightIndex
      }
    }

    // In preparation for the next iteration, restore the middle statistics
    // to the interval [leftIndex + 1, handles.length)
    middle.load(oldMiddleState)
  }

  return [bestLeftIndex, bestRightIndex]
}
86
// The interval guesser is this module's single export.
module.exports = guessInterval
88
// Decide whether the observations summarized by `leftSide` have a mean that
// is meaningfully below the mean of `rightSide`. Both arguments are
// OnlineMeanVariance accumulators (providing `mean`, `variance` and `size`).
function statisticallyLessThan (leftSide, rightSide) {
  // Handle counts move in steps of 1, so a mean difference smaller than 1
  // is not meaningful. The partition means could in principle differ by
  // less than one, but in practice, on a benchmarked server, the real
  // difference is always much larger.
  const mindiff = 1

  const leftVariance = leftSide.variance
  const rightVariance = rightSide.variance

  // A t-test needs usable variance estimates on both sides. If either
  // variance is below the absolute tolerance or could not be estimated
  // (fewer than two observations yields NaN), fall back to a naive
  // comparison of the means.
  const variancesUnusable =
    leftVariance < atol || Number.isNaN(leftVariance) ||
    rightVariance < atol || Number.isNaN(rightVariance)
  if (variancesUnusable) {
    return leftSide.mean + mindiff < rightSide.mean
  }

  // The variance is high enough: run a one-sided two-sample t-test with a
  // shift of `mindiff`, and report "less" when the null hypothesis is
  // rejected.
  const result = ttest(leftSide, rightSide, {
    alpha: 0.01,
    alternative: 'less',
    mu: -mindiff
  })
  return !result.valid()
}
112
// A running mean/variance accumulator using Welford's online algorithm,
// extended with the exact inverse update so observations can be removed
// again in O(1). For background see:
// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
// These O(1) updates are what keep the exact interval optimization at
// O(n^2); without an online variant each step would recompute from scratch
// and the scan would cost O(n^3).
class OnlineMeanVariance {
  constructor () {
    this.reset()
  }

  // Drop all accumulated observations, returning to the empty state.
  reset () {
    this.size = 0
    this.mean = 0
    this.sse = 0 // sum of squared errors around the mean
  }

  // Snapshot the accumulator state as a plain object (see load()).
  save () {
    const { size, mean, sse } = this
    return { size, mean, sse }
  }

  // Restore a state previously captured with save().
  load (state) {
    this.size = state.size
    this.mean = state.mean
    this.sse = state.sse
  }

  // Welford forward update. With the definitions
  //   δ₁ = x − µ_{n−1}   (deviation from the old mean)
  //   δ₂ = x − µ_n       (deviation from the new mean)
  // the update is:
  //   µ_n = µ_{n−1} + δ₁ / n
  //   S_n = S_{n−1} + δ₁ · δ₂
  // Each delta only becomes computable once the mean it refers to exists,
  // which fixes the statement order below.
  add (x) {
    this.size += 1
    const deltaBefore = x - this.mean
    this.mean += deltaBefore / this.size
    const deltaAfter = x - this.mean
    this.sse += deltaBefore * deltaAfter
  }

  // Inverse Welford update, derived by solving the forward equations for
  // the previous state:
  //   µ_n = µ_{n−1} + (x − µ_{n−1}) / n
  //   ⇒ (n − 1) µ_{n−1} = n µ_n − x
  //   ⇒ µ_{n−1} = µ_n − δ₂ / (n − 1)
  //   S_{n−1} = S_n − δ₁ · δ₂
  // Here δ₂ is available first, and δ₁ becomes available once the old mean
  // has been recovered.
  remove (x) {
    this.size -= 1
    const deltaAfter = x - this.mean
    this.mean -= deltaAfter / this.size
    const deltaBefore = x - this.mean
    this.sse -= deltaAfter * deltaBefore
  }

  // Sample variance: σ² = S_n / (n − 1); NaN until at least two
  // observations have been added. Only the t-test uses this — the interval
  // scan compares SSE directly, since the total observation count is
  // constant and the extra division would not change the argmin.
  get variance () {
    return this.size < 2 ? NaN : this.sse / (this.size - 1)
  }
}