1 |
|
2 |
|
3 | const ttest = require('ttest')
|
// Absolute tolerance used when comparing floating-point error sums; an
// improvement smaller than this is treated as noise.
const atol = 1e-9

// The main source of handle variance should come from the middle part that
// contains the benchmark data. To bias the interval guessing, such that
// the middle part contains the most variance, its sum-squared-error (sse)
// is weighted lower than the left and right parts.
const varianceBias = 0.25 // (1/2)**2

// Amount of time trimmed off each end of the guessed interval, to skip the
// ramp-up/ramp-down transients at the boundaries.
const trimTime = 100 // ms
|
13 |
|
// Guess the interval of `data` that contains the actual benchmark, by fitting
// a three-segment piecewise-constant curve to the handle counts.
// `data` is an array of { timestamp, handles } observations in time order.
// Returns [start, end] such that `data.slice(start, end)` is the guessed
// benchmark interval.
function guessInterval (data) {
  // This function separates the handle sequence into three partitions.
  // This is to guess the benchmarking interval; in a typical benchmark the
  // handle curve will look like:
  //
  // handles
  // |
  // |       |^^^^^|
  // | -----|       |-----
  // ------------------------ time
  //
  // The lines can be very flat but we need to allow for some variation.
  // The problem of partition generation is thus treated as a piecewise least
  // squares regression problem, with unknown partition boundaries.
  // There is no good closed-form solution, but since this is a 1D problem
  // and the curves are constrained to be flat, the solution can be brute
  // forced in O(n^2) time. Doing this does require an online mean variance
  // algorithm, thus the Welford algorithm is used together with a derived
  // inverse (removal) Welford algorithm.
  const left = new OnlineMeanVariance()
  const middle = new OnlineMeanVariance()
  const right = new OnlineMeanVariance()

  // For performance, restructure the data into dense arrays
  const timestamps = data.map((d) => d.timestamp)
  const handles = data.map((d) => d.handles)

  // Start by assigning all observations to the middle partition, it will
  // have the interval [0, handles.length]
  for (const datum of handles) {
    middle.add(datum)
  }

  // Use the current partition, with everything assigned to the middle,
  // as the initial solution.
  let bestTotalError = middle.sse
  let bestLeftIndex = 0 // corresponds to slice(0, leftIndex)
  let bestRightIndex = handles.length // corresponds to slice(rightIndex)

  // Brute force all valid combinations of leftIndex and rightIndex.
  // Middle should always include at least one value, so don't scan to
  // the end but instead `handles.length - 1`.
  for (let leftIndex = -1; leftIndex < handles.length - 1; leftIndex++) {
    // set interval to [leftIndex, handles.length]
    // (leftIndex === -1 is the degenerate case with an empty left partition)
    if (leftIndex >= 0) {
      left.add(handles[leftIndex])
      middle.remove(handles[leftIndex])
    }
    right.reset()

    // try all valid rightIndex values
    // because the middle is going to mutate as we scan from the right, save
    // the middle stat for the interval [leftIndex, handles.length]
    const oldMiddleState = middle.save()
    // Middle should always include at least one value, so don't scan to
    // leftIndex but instead `leftIndex + 1`.
    for (let rightIndex = handles.length; rightIndex > leftIndex + 1; rightIndex--) {
      // set interval to [leftIndex, rightIndex]
      if (rightIndex < handles.length) {
        right.add(handles[rightIndex])
        middle.remove(handles[rightIndex])
      }

      // Check for improvement. An improvement must exceed the absolute
      // tolerance `atol`, and both flanks must be statistically below the
      // middle (the benchmark plateau should sit above its surroundings).
      if (middle.size >= 2 &&
          bestTotalError > left.sse + varianceBias * middle.sse + right.sse + atol &&
          statisticallyLessThan(left, middle) &&
          statisticallyLessThan(right, middle)) {
        bestTotalError = left.sse + varianceBias * middle.sse + right.sse
        bestLeftIndex = leftIndex + 1
        bestRightIndex = rightIndex
      }
    }

    // In preparation for the next iteration, restore the middle stat to the
    // interval [leftIndex, handles.length]
    middle.load(oldMiddleState)
  }

  // Trim the left and right index inward by up to `trimTime` milliseconds,
  // but never past the opposite boundary. This skips the transient at each
  // edge of the guessed interval.
  const leftTime = timestamps[bestLeftIndex]
  for (let i = bestLeftIndex; i <= bestRightIndex - 1; i++) {
    if (timestamps[i] - leftTime >= trimTime) {
      break
    }
    bestLeftIndex = i
  }

  const rightTime = timestamps[bestRightIndex - 1]
  for (let i = bestRightIndex; i >= bestLeftIndex; i--) {
    if (rightTime - timestamps[i - 1] >= trimTime) {
      break
    }
    bestRightIndex = i
  }

  // Return the .slice values
  return [bestLeftIndex, bestRightIndex]
}
|
113 |
|
// The interval guesser is this module's only export.
module.exports = guessInterval
|
115 |
|
// Decide whether `leftSide`'s mean is statistically below `rightSide`'s mean.
// Both arguments are OnlineMeanVariance instances (exposing .mean, .size and
// .variance).
function statisticallyLessThan (leftSide, rightSide) {
  // Handle counts move in steps of 1, so looking for differences smaller
  // than 1 doesn't really make sense. The partitions are of course means,
  // so a valid difference could be below one, but in practice a benchmarked
  // server shows a much larger gap.
  const minimumDifference = 1

  // When a variance is vanishingly small or could not be estimated at all,
  // a t-test is meaningless — fall back to a naive comparison of the means.
  const leftVariance = leftSide.variance
  const rightVariance = rightSide.variance
  if (Number.isNaN(leftVariance) || Number.isNaN(rightVariance) ||
      leftVariance < atol || rightVariance < atol) {
    return leftSide.mean + minimumDifference < rightSide.mean
  }

  // The variance is usable: run a one-sided Student's t-test at the 1%
  // significance level for mean(left) - mean(right) < -minimumDifference.
  const hypothesisTest = ttest(leftSide, rightSide, {
    alpha: 0.01,
    alternative: 'less',
    mu: -minimumDifference
  })
  return !hypothesisTest.valid()
}
|
139 |
|
140 | // If you need to brush up on your online mean variance algorithms then take
|
141 | // a look at:
|
142 | // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
|
143 | // This is required for O(n^2) exact optimization of the interval. If a non
|
144 | // online-variant was used, the complexity would be O(n^3).
|
// Streaming estimator of mean and sum-of-squared-errors (SSE), implemented
// with Welford's online algorithm plus a derived inverse update so that
// observations can also be removed in O(1). This is what keeps the exact
// interval optimization at O(n^2) — a non-online variant would cost O(n^3).
// Background: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
class OnlineMeanVariance {
  constructor () {
    this.reset()
  }

  // Drop every observation and return to the empty state.
  reset () {
    this.size = 0
    this.mean = 0
    this.sse = 0 // sum of square errors
  }

  // Snapshot the current statistics as a plain object for later load().
  save () {
    const { size, mean, sse } = this
    return { size, mean, sse }
  }

  // Restore statistics from a snapshot produced by save().
  load (snapshot) {
    this.size = snapshot.size
    this.mean = snapshot.mean
    this.sse = snapshot.sse
  }

  // Welford forward update. With δ_1 = x_n - µ_{n-1} and δ_2 = x_n - µ_n:
  //   µ_n = µ_{n-1} + δ_1 / n
  //   S_n = S_{n-1} + δ_1 * δ_2
  // δ_1 is available before the mean update, δ_2 right after it.
  add (observation) {
    this.size += 1
    const preDelta = observation - this.mean // δ_1
    this.mean += preDelta / this.size
    const postDelta = observation - this.mean // δ_2
    this.sse += preDelta * postDelta
  }

  // Inverse Welford update, derived by solving the forward equations for the
  // previous state:
  //   µ_{n-1} = µ_n - δ_2 / (n - 1)
  //   S_{n-1} = S_n - δ_1 * δ_2
  // Here δ_2 is available before the mean rollback, δ_1 right after it.
  remove (observation) {
    this.size -= 1
    const postDelta = observation - this.mean // δ_2
    this.mean -= postDelta / this.size
    const preDelta = observation - this.mean // δ_1
    this.sse -= postDelta * preDelta
  }

  // Unbiased sample variance: σ_n = S_n / (n - 1).
  // Only consumed by the t-test; the interval scans compare raw SSE since the
  // observation count is constant there and the division would be wasted.
  get variance () {
    return this.size < 2 ? NaN : this.sse / (this.size - 1)
  }
}
|