Getting the data

# Read the two CSV exports of one recording.
#
# name:      sample name; used both as the directory and as the file-name
#            prefix, i.e. <name>/<name>-gcevents.csv and
#            <name>/<name>-processstat.csv (relative to the working directory).
# has.issue: accepted but not used by this function.
#
# Returns a list with two data frames: `gc` (the gcevents file) and `stat`
# (the processstat file). Surrounding whitespace in fields is stripped.
load.data = function (name, has.issue) {
  csv.path = function (suffix) {
    paste0(name, '/', name, suffix)
  }

  gc.events = read.csv(csv.path('-gcevents.csv'), strip.white=TRUE)
  process.stat = read.csv(csv.path('-processstat.csv'), strip.white=TRUE)

  list(gc=gc.events, stat=process.stat)
}

# Load the three sample recordings; has.issue is not supplied (load.data never reads it).
dat.sample.1 = load.data(name='10145.clinic-doctor')
dat.sample.2 = load.data(name='12318.clinic-doctor')
dat.sample.3 = load.data(name='3139.clinic-doctor')

# Show the raw GC events of the first sample.
print.data.table(dat.sample.1[['gc']])
interval startTimestamp endTimestamp type
0 1.51188e+12 1.51188e+12 SCAVENGE
0 1.51188e+12 1.51188e+12 SCAVENGE
0 1.51188e+12 1.51188e+12 SCAVENGE
0 1.51188e+12 1.51188e+12 SCAVENGE
0 1.51188e+12 1.51188e+12 SCAVENGE
0 1.51188e+12 1.51188e+12 INCREMENTAL_MARKING
# Show the raw process statistics of the first sample.
print.data.table(dat.sample.1[['stat']])
timestamp interval delay cpu memory.rss memory.heapTotal memory.heapUsed memory.external handles
1.51188e+12 0 167.104022 1.1124310 44048384 25067520 18032920 408161 3
1.51188e+12 0 0.764206 0.2614220 44142592 25067520 18121384 408350 3
1.51188e+12 0 2.871670 0.0258708 44146688 25067520 18129704 408419 3
1.51188e+12 0 0.255304 0.0182345 44146688 25067520 18134104 408488 3
1.51188e+12 0 1.186313 0.0172532 44146688 25067520 18138376 408557 3
1.51188e+12 0 1.871463 0.0173525 44146688 25067520 18142648 408626 3
# Keep only the rows recorded in interval 1 and attach relative time columns.
#
# dat: list with `gc` and `stat` data frames, both having an `interval`
#      column; `stat` has `timestamp`, `gc` has `startTimestamp` and
#      `endTimestamp`.
#
# Returns list(gc, stat) where `stat` gains a POSIXct `time` column and `gc`
# gains POSIXct `startTime` / `endTime` columns. All three are measured from
# the first remaining stat row, so that row maps to 1970-01-01 00:00:00 GMT.
subset.interval = function (dat) {
  gc.rows = dat$gc[dat$gc$interval == 1, ]
  stat.rows = dat$stat[dat$stat$interval == 1, ]

  # The first stat sample of the interval is the zero point of the time axis.
  t0 = stat.rows[1, 'timestamp']

  # Timestamps are divided by 1000 before being handed to as.POSIXct,
  # i.e. they are treated as milliseconds.
  relative.time = function (timestamp) {
    as.POSIXct((timestamp - t0) / 1000, origin="1970-01-01", tz="GMT")
  }

  stat.rows$time = relative.time(stat.rows$timestamp)
  gc.rows$startTime = relative.time(gc.rows$startTimestamp)
  gc.rows$endTime = relative.time(gc.rows$endTimestamp)

  list(gc=gc.rows, stat=stat.rows)
}

# Restrict every sample to interval 1; each sample is subset from its own
# recording (sample 3 must not be derived from dat.sample.2).
dat.sample.1 = subset.interval(dat.sample.1)
dat.sample.2 = subset.interval(dat.sample.2)
dat.sample.3 = subset.interval(dat.sample.3)

# Show the GC events that remain for the third sample.
print.data.table(dat.sample.3$gc)
interval startTimestamp endTimestamp type
10 1 1.511966e+12 1.511966e+12 SCAVENGE
11 1 1.511966e+12 1.511966e+12 INCREMENTAL_MARKING
12 1 1.511966e+12 1.511966e+12 MARK_SWEEP_COMPACT
13 1 1.511966e+12 1.511966e+12 SCAVENGE
14 1 1.511966e+12 1.511966e+12 SCAVENGE
15 1 1.511966e+12 1.511966e+12 SCAVENGE
# Show the process statistics that remain for the third sample.
print.data.table(dat.sample.3[['stat']])
timestamp interval delay cpu memory.rss memory.heapTotal memory.heapUsed memory.external handles
113 1.511966e+12 1 29.262148 1.4371603 59179008 37085184 24149248 58063 871
114 1.511966e+12 1 19.561119 1.3971054 61825024 55959552 24500528 57963 999
115 1.511966e+12 1 44.112346 1.3991631 62373888 59629568 26785712 58063 1127
116 1.511966e+12 1 16.410012 1.4713359 62566400 59629568 29982552 58193 1200
117 1.511966e+12 1 4.488328 0.9864492 63397888 59629568 32472776 58263 1256
118 1.511966e+12 1 4.557432 0.9692644 65548288 59629568 34636368 58333 1265

Plot data

# Reshape the sample-3 process stats so every metric other than the id
# columns becomes a (variable, value) pair.
dat = melt(dat.sample.3$stat, id.vars=c("time", "timestamp", "interval"))

# One facet per metric with its own y scale; GC events are drawn as
# full-height rectangles coloured by GC type, with the metric as a line.
p = ggplot(dat) +
  geom_rect(
    data = dat.sample.3$gc,
    aes(xmin=startTime, xmax=endTime, ymin=-Inf, ymax=Inf, fill=type),
    alpha=0.8
  ) +
  geom_line(aes(x = time, y = value)) +
  facet_grid(variable ~ ., scales='free_y') +
  scale_x_datetime(labels = date_format("%S sec")) +
  scale_y_continuous(limits = c(0, NA)) +
  theme(legend.position="bottom")
print(p)

Model hypothesis

Hypothesis: `delay` is correlated with MARK_SWEEP_COMPACT garbage-collection events.

Model

# Flag every process-stat sample that falls inside a MARK_SWEEP_COMPACT event.
#
# dat:         list with a `gc` data frame (columns `type`, `startTimestamp`,
#              `endTimestamp`) and a `stat` data frame (column `timestamp`).
# end.padding: amount added to each event's endTimestamp, in the same unit as
#              the timestamps, before matching. Presumably this lets samples
#              taken just after the collection count as affected by it.
#
# Returns list(gc, stat): `gc` is passed through unchanged and `stat` is a
# copy of dat$stat with an extra logical column `msc`. When there are no
# MARK_SWEEP_COMPACT events, `msc` is FALSE for every row.
dat.annotate.msc = function (dat, end.padding = 20) {
  msc = dat$gc[dat$gc$type == 'MARK_SWEEP_COMPACT', ]

  annotate = rep(FALSE, nrow(dat$stat))
  # seq_len() gives an empty sequence for zero events. 1:nrow(msc) would
  # iterate over c(1, 0), collapse the mask to length 0 and break the
  # column assignment below.
  for (i in seq_len(nrow(msc))) {
    intervalStart = msc[i, 'startTimestamp']
    intervalEnd = msc[i, 'endTimestamp'] + end.padding

    inside = intervalStart <= dat$stat$timestamp & dat$stat$timestamp <= intervalEnd
    annotate = annotate | inside
  }

  dat.stat = data.frame(dat$stat)
  dat.stat$msc = annotate
  list(
    gc=dat$gc,
    stat=dat.stat
  )
}

# Add the logical `msc` column to the sample-3 stats.
dat.sample.3 = dat.annotate.msc(dat.sample.3)

# Scatter plot of delay over time, coloured by the `msc` flag.
p = ggplot(dat.sample.3$stat, aes(x = time, y = delay, colour=msc)) +
  geom_point() +
  scale_x_datetime(labels = date_format("%S sec")) +
  scale_y_continuous(limits = c(0, NA)) +
  theme(legend.position="bottom")
print(p)

# Summarise `delay` separately for samples with and without the `msc` flag.
#
# dat: list whose `stat` data frame has a numeric `delay` column and a
#      logical `msc` column.
#
# Returns a two-row data frame with columns `msc` (TRUE, then FALSE),
# `median` and `max` of the delay in each group.
analysis.msc.delay = function (dat) {
  stat = dat$stat

  delay.flagged = stat[stat$msc == TRUE, 'delay']
  delay.unflagged = stat[stat$msc == FALSE, 'delay']

  data.frame(
    msc = c(TRUE, FALSE),
    median = c(median(delay.flagged), median(delay.unflagged)),
    max = c(max(delay.flagged), max(delay.unflagged))
  )
}

# Render the per-group delay summary (median and max) for sample 3 as a table.
kable(analysis.msc.delay(dat.sample.3))
msc median max
TRUE 230.559838 807.8275
FALSE 3.332706 356.5879