1 |
|
2 |
|
3 | const ttest = require('ttest')
|
// Absolute tolerance used when comparing floating-point error sums; an
// improvement smaller than this is treated as noise.
const atol = 1e-9

// The main source of handle variance should come from the middle part that
// contains the benchmark data. To bias the interval guessing, such that
// the middle part contains the most variance, its sum-squared-error (sse)
// is weighted lower than the left and right parts.
const varianceBias = 0.25 // (1/2)**2

// Amount of time trimmed off each end of the guessed interval, to skip the
// ramp-up/ramp-down transients at the boundaries.
const trimTime = 100 // ms
|
13 |
|
// Guess the interval of `data` that contains the actual benchmark, by fitting
// a three-segment piecewise-constant curve to the handle counts.
// `data` is an array of { timestamp, handles } observations in time order.
// Returns [start, end] such that `data.slice(start, end)` is the guessed
// benchmark interval.
function guessInterval (data) {
  // This function separates the handle sequence into three partitions.
  // This is to guess the benchmarking interval; in a typical benchmark the
  // handle curve will look like:
  //
  // handles
  // |
  // |       |^^^^^|
  // | -----|       |-----
  // ------------------------ time
  //
  // The lines can be very flat but we need to allow for some variation.
  // The problem of partition generation is thus treated as a piecewise least
  // squares regression problem, with unknown partition boundaries.
  // There is no good closed-form solution, but since this is a 1D problem
  // and the curves are constrained to be flat, the solution can be brute
  // forced in O(n^2) time. Doing this does require an online mean variance
  // algorithm, thus the Welford algorithm is used together with a derived
  // inverse (removal) Welford algorithm.
  const left = new OnlineMeanVariance()
  const middle = new OnlineMeanVariance()
  const right = new OnlineMeanVariance()

  // For performance, restructure the data into dense arrays
  const timestamps = data.map((d) => d.timestamp)
  const handles = data.map((d) => d.handles)

  // Start by assigning all observations to the middle partition, it will
  // have the interval [0, handles.length]
  for (const datum of handles) {
    middle.add(datum)
  }

  // Use the current partition, with everything assigned to the middle,
  // as the initial solution.
  let bestTotalError = middle.sse
  let bestLeftIndex = 0 // corresponds to slice(0, leftIndex)
  let bestRightIndex = handles.length // corresponds to slice(rightIndex)

  // Brute force all valid combinations of leftIndex and rightIndex.
  // Middle should always include at least one value, so don't scan to
  // the end but instead `handles.length - 1`.
  for (let leftIndex = -1; leftIndex < handles.length - 1; leftIndex++) {
    // set interval to [leftIndex, handles.length]
    // (leftIndex === -1 is the degenerate case with an empty left partition)
    if (leftIndex >= 0) {
      left.add(handles[leftIndex])
      middle.remove(handles[leftIndex])
    }
    right.reset()

    // try all valid rightIndex values
    // because the middle is going to mutate as we scan from the right, save
    // the middle stat for the interval [leftIndex, handles.length]
    const oldMiddleState = middle.save()
    // Middle should always include at least one value, so don't scan to
    // leftIndex but instead `leftIndex + 1`.
    for (let rightIndex = handles.length; rightIndex > leftIndex + 1; rightIndex--) {
      // set interval to [leftIndex, rightIndex]
      if (rightIndex < handles.length) {
        right.add(handles[rightIndex])
        middle.remove(handles[rightIndex])
      }

      // Check for improvement. An improvement must exceed the absolute
      // tolerance `atol`, and both flanks must be statistically below the
      // middle (the benchmark plateau should sit above its surroundings).
      if (middle.size >= 2 &&
          bestTotalError > left.sse + varianceBias * middle.sse + right.sse + atol &&
          statisticallyLessThan(left, middle) &&
          statisticallyLessThan(right, middle)) {
        bestTotalError = left.sse + varianceBias * middle.sse + right.sse
        bestLeftIndex = leftIndex + 1
        bestRightIndex = rightIndex
      }
    }

    // In preparation for the next iteration, restore the middle stat to the
    // interval [leftIndex, handles.length]
    middle.load(oldMiddleState)
  }

  // Trim the left and right index inward by up to `trimTime` milliseconds,
  // but never past the opposite boundary. This skips the transient at each
  // edge of the guessed interval.
  const leftTime = timestamps[bestLeftIndex]
  for (let i = bestLeftIndex; i <= bestRightIndex - 1; i++) {
    if (timestamps[i] - leftTime >= trimTime) {
      break
    }
    bestLeftIndex = i
  }

  const rightTime = timestamps[bestRightIndex - 1]
  for (let i = bestRightIndex; i >= bestLeftIndex; i--) {
    if (rightTime - timestamps[i - 1] >= trimTime) {
      break
    }
    bestRightIndex = i
  }

  // Return the .slice values
  return [bestLeftIndex, bestRightIndex]
}
|
113 |
|
// The interval guesser is this module's only export.
module.exports = guessInterval
|
115 |
|
// Decide whether `leftSide`'s mean is statistically below `rightSide`'s mean.
// Both arguments are OnlineMeanVariance instances (exposing .mean, .size and
// .variance).
function statisticallyLessThan (leftSide, rightSide) {
  // Handle counts move in steps of 1, so looking for differences smaller
  // than 1 doesn't really make sense. The partitions are of course means,
  // so a valid difference could be below one, but in practice a benchmarked
  // server shows a much larger gap.
  const minimumDifference = 1

  // When a variance is vanishingly small or could not be estimated at all,
  // a t-test is meaningless — fall back to a naive comparison of the means.
  const leftVariance = leftSide.variance
  const rightVariance = rightSide.variance
  if (Number.isNaN(leftVariance) || Number.isNaN(rightVariance) ||
      leftVariance < atol || rightVariance < atol) {
    return leftSide.mean + minimumDifference < rightSide.mean
  }

  // The variance is usable: run a one-sided Student's t-test at the 1%
  // significance level for mean(left) - mean(right) < -minimumDifference.
  const hypothesisTest = ttest(leftSide, rightSide, {
    alpha: 0.01,
    alternative: 'less',
    mu: -minimumDifference
  })
  return !hypothesisTest.valid()
}
|
139 |
|
140 | // If you need to brush up on your online mean variance algorithms then take
|
141 | // a look at:
|
142 | // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
|
143 | // This is required for O(n^2) exact optimization of the interval. If a non
|
144 | // online-variant was used, the complexity would be O(n^3).
|
// Streaming estimator of mean and sum-of-squared-errors (SSE), implemented
// with Welford's online algorithm plus a derived inverse update so that
// observations can also be removed in O(1). This is what keeps the exact
// interval optimization at O(n^2) — a non-online variant would cost O(n^3).
// Background: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
class OnlineMeanVariance {
  constructor () {
    this.reset()
  }

  // Drop every observation and return to the empty state.
  reset () {
    this.size = 0
    this.mean = 0
    this.sse = 0 // sum of square errors
  }

  // Snapshot the current statistics as a plain object for later load().
  save () {
    const { size, mean, sse } = this
    return { size, mean, sse }
  }

  // Restore statistics from a snapshot produced by save().
  load (snapshot) {
    this.size = snapshot.size
    this.mean = snapshot.mean
    this.sse = snapshot.sse
  }

  // Welford forward update. With δ_1 = x_n - µ_{n-1} and δ_2 = x_n - µ_n:
  //   µ_n = µ_{n-1} + δ_1 / n
  //   S_n = S_{n-1} + δ_1 * δ_2
  // δ_1 is available before the mean update, δ_2 right after it.
  add (observation) {
    this.size += 1
    const preDelta = observation - this.mean // δ_1
    this.mean += preDelta / this.size
    const postDelta = observation - this.mean // δ_2
    this.sse += preDelta * postDelta
  }

  // Inverse Welford update, derived by solving the forward equations for the
  // previous state:
  //   µ_{n-1} = µ_n - δ_2 / (n - 1)
  //   S_{n-1} = S_n - δ_1 * δ_2
  // Here δ_2 is available before the mean rollback, δ_1 right after it.
  remove (observation) {
    this.size -= 1
    const postDelta = observation - this.mean // δ_2
    this.mean -= postDelta / this.size
    const preDelta = observation - this.mean // δ_1
    this.sse -= postDelta * preDelta
  }

  // Unbiased sample variance: σ_n = S_n / (n - 1).
  // Only consumed by the t-test; the interval scans compare raw SSE since the
  // observation count is constant there and the division would be wasted.
  get variance () {
    return this.size < 2 ? NaN : this.sse / (this.size - 1)
  }
}
|