// Retrieved from unpkg.com (raw view), 7.64 kB JavaScript source.
'use strict'

const ttest = require('ttest')

// Absolute tolerance: error differences (and variances) smaller than this
// are treated as zero, so tiny floating-point noise never counts as an
// "improvement" or as a usable variance estimate.
const atol = 1e-9

// The main source of handle variance should come from the middle part that
// contains the benchmark data. To bias the interval guessing, such that
// the middle part contains the most variance, its sum-squared-error (sse)
// is considered lower than the left and right parts.
const varianceBias = 0.25 // (1/2)**2

// Once the interval is found, trim up to this much time off each end of it
// (see the trim loops at the bottom of guessInterval).
const trimTime = 100 // ms
function guessInterval (data) {
  // Guess which slice of `data` contains the actual benchmark run.
  //
  // data:    array of samples shaped { timestamp, handles }, ordered by
  //          timestamp.
  // returns: [leftIndex, rightIndex] — bounds such that
  //          data.slice(leftIndex, rightIndex) is the guessed interval.
  //
  // This function separates the handle sequence up into three partitions.
  // This is to guess the benchmarking interval; in a typical benchmark the
  // handle curve will look like:
  //
  // handles
  // |
  // |       |^^^^^|
  // | -----|       |-----
  // ------------------------ time
  //
  // The lines can be very flat but we need to allow for some variation.
  // The problem of partition generation is thus treated as a piecewise least
  // square regression problem, with unknown partitions.
  // There are no good solutions to this problem, but since it is a 1D problem
  // and we can constrain the curves to be flat, the solution can be brute
  // forced in O(n^2) time. Doing this does require an online mean variance
  // algorithm, thus the Welford algorithm is used together with a derived
  // inverse Welford algorithm.
  const left = new OnlineMeanVariance()
  const middle = new OnlineMeanVariance()
  const right = new OnlineMeanVariance()

  // For performance, restructure the data into dense arrays
  const timestamps = data.map((d) => d.timestamp)
  const handles = data.map((d) => d.handles)

  // Start by assigning all observations to the middle partition; it will
  // have the interval [0, handles.length]
  for (const datum of handles) {
    middle.add(datum)
  }

  // Use the current partition, of everything being assigned to the middle,
  // as the initial solution.
  let bestTotalError = middle.sse
  let bestLeftIndex = 0 // corresponds to slice(0, leftIndex)
  let bestRightIndex = handles.length // corresponds to slice(rightIndex)

  // Brute force all valid combinations of leftIndex and rightIndex.
  // Middle should always include at least one value, so don't scan to
  // the end but instead `handles.length - 1`.
  for (let leftIndex = -1; leftIndex < handles.length - 1; leftIndex++) {
    // set interval to [leftIndex, handles.length]
    // (leftIndex === -1 is the sentinel for an empty left partition)
    if (leftIndex >= 0) {
      left.add(handles[leftIndex])
      middle.remove(handles[leftIndex])
    }
    right.reset()

    // try all valid rightIndex values;
    // because the middle is going to mutate as we scan from the right, save
    // the middle stat for the interval [leftIndex, handles.length]
    const oldMiddleState = middle.save()
    // Middle should always include at least one value, so don't scan to
    // leftIndex but instead `leftIndex + 1`.
    for (let rightIndex = handles.length; rightIndex > leftIndex + 1; rightIndex--) {
      // set interval to [leftIndex, rightIndex]
      // (rightIndex === handles.length means the right partition is empty)
      if (rightIndex < handles.length) {
        right.add(handles[rightIndex])
        middle.remove(handles[rightIndex])
      }

      // check for improvement; improvement must exceed the absolute
      // tolerance `atol`, and both flat outer partitions must sit
      // statistically below the middle one.
      if (middle.size >= 2 &&
          bestTotalError > left.sse + varianceBias * middle.sse + right.sse + atol &&
          statisticallyLessThan(left, middle) &&
          statisticallyLessThan(right, middle)) {
        bestTotalError = left.sse + varianceBias * middle.sse + right.sse
        bestLeftIndex = leftIndex + 1
        bestRightIndex = rightIndex
      }
    }

    // In preparation for the next iteration, restore the middle stat to the
    // interval [leftIndex, handles.length]
    middle.load(oldMiddleState)
  }

  // Trim the left and right index by `trimTime` milliseconds, but not more
  const leftTime = timestamps[bestLeftIndex]
  for (let i = bestLeftIndex; i <= bestRightIndex - 1; i++) {
    if (timestamps[i] - leftTime >= trimTime) {
      break
    }
    bestLeftIndex = i
  }

  const rightTime = timestamps[bestRightIndex - 1]
  for (let i = bestRightIndex; i >= bestLeftIndex; i--) {
    if (rightTime - timestamps[i - 1] >= trimTime) {
      break
    }
    bestRightIndex = i
  }

  // Return the .slice value
  return [bestLeftIndex, bestRightIndex]
}
113
// Expose the interval guesser as this module's sole export.
module.exports = guessInterval
115
function statisticallyLessThan (leftSide, rightSide) {
  // Handles arrive in whole-number steps, so demanding a full unit of
  // separation between the two means is the practical threshold. The
  // partition means could in theory differ by less than one, but on a
  // server under benchmark load the gap is always much larger.
  const minimumDifference = 1

  // With a vanishing or inestimable variance on either side a t-test is
  // meaningless — fall back to a naive comparison of the means.
  const varianceUnusable =
    leftSide.variance < atol || rightSide.variance < atol ||
    Number.isNaN(leftSide.variance) || Number.isNaN(rightSide.variance)
  if (varianceUnusable) {
    return leftSide.mean + minimumDifference < rightSide.mean
  }

  // Variance is high enough to be informative: run a one-sided t-test for
  // "leftSide's mean lies at least `minimumDifference` below rightSide's".
  const hypothesis = ttest(leftSide, rightSide, {
    alpha: 0.01,
    alternative: 'less',
    mu: -minimumDifference
  })
  // ttest's .valid() reports whether the null hypothesis stands; the
  // difference is significant exactly when it does not.
  return !hypothesis.valid()
}
139
// Online (streaming) mean/variance accumulator, needed for the O(n^2) exact
// optimization of the interval — a non-online variant would make it O(n^3).
// Background on the algorithms:
// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
class OnlineMeanVariance {
  constructor () {
    this.reset()
  }

  // Forget all observations and return to the empty state.
  reset () {
    this.size = 0
    this.mean = 0
    this.sse = 0 // sum of square errors
  }

  // Snapshot the current statistics as a plain object, suitable for load().
  save () {
    const { size, mean, sse } = this
    return { size, mean, sse }
  }

  // Restore statistics previously captured with save().
  load (snapshot) {
    this.size = snapshot.size
    this.mean = snapshot.mean
    this.sse = snapshot.sse
  }

  // Welford's online update. With δ₁ = x − µ_{n−1} and δ₂ = x − µ_n:
  //   µ_n = µ_{n−1} + δ₁ / n
  //   S_n = S_{n−1} + δ₁ · δ₂
  // Each delta only becomes computable once the mean it references exists,
  // hence the interleaved ordering below.
  add (x) {
    this.size += 1
    const offsetBefore = x - this.mean
    this.mean += offsetBefore / this.size
    const offsetAfter = x - this.mean
    this.sse += offsetBefore * offsetAfter
  }

  // Inverse Welford update, derived by solving add()'s equations for the
  // previous state. With δ₂ = x − µ_n and δ₁ = x − µ_{n−1}:
  //   µ_{n−1} = µ_n − δ₂ / (n − 1)
  //   S_{n−1} = S_n − δ₁ · δ₂
  remove (x) {
    this.size -= 1
    const offsetAfter = x - this.mean
    this.mean -= offsetAfter / this.size
    const offsetBefore = x - this.mean
    this.sse -= offsetAfter * offsetBefore
  }

  // Sample variance σ² = S_n / (n − 1); NaN until two observations exist.
  // The interval scans compare raw SSE directly, since their observation
  // count is constant and the division would be wasted work — this getter
  // is only consulted by the t-test path.
  get variance () {
    return this.size < 2 ? NaN : this.sse / (this.size - 1)
  }
}
225}