#########################################################################
# This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
# License: AGPLv3 s.t. "Commons Clause" – see LICENSE.md for details
#########################################################################

###
Some code specific to running a project in the KuCalc environment.
###

fs = require('fs')
async = require('async')

misc      = require('@cocalc/util/misc')
misc_node = require('@cocalc/backend/misc_node')

path       = require('path')
{execSync} = require('child_process')
{defaults} = misc = require('@cocalc/util/misc')

{start_ts, session_id} = require('./consts')

# Module-level state.
# PROJECT_ID is assigned by exports.init_health_metrics; nothing else in this
# file reads it.
PROJECT_ID = undefined
# Common name prefix of the project metrics; not referenced anywhere else in this
# file (the metric names in exports.prometheus_metrics are spelled out literally).
PREFIX = 'cocalc_project_'

# Prometheus client setup -- https://github.com/siimon/prom-client
prom_client = require('prom-client')

# additionally, record GC statistics
# https://www.npmjs.com/package/prometheus-gc-stats
## I'm commenting this out because the package prometheus-gc-stats
## on npm very explicitly says it does not support prom-client
## version 13, which is what we have installed everywhere.  That
## version is a significant breaking change from version 12, so
## I'm also not comfortable reverting back.  Harald I think force
## upgraded prom-client to version 13 in this commit: b31e087ea2c640f494db15b652d9d0f86e7bd8a5
# require('prometheus-gc-stats')()()

# collect some recommended default metrics every 10 seconds
# NOTE(review): newer prom-client releases appear to collect default metrics at
# scrape time and to ignore the `timeout` option -- confirm against the installed
# version before relying on the 10 second interval.
prom_client.collectDefaultMetrics(timeout: 10 * 1000)

# --- end prometheus setup

# This gets **changed** to true, if a certain
# command line flag is passed in.
# While it is false, exports.init_health_metrics returns without registering
# any route.
exports.IN_KUCALC = false

# Most recent status object that compute_status produced without an error.
# Written by update_project_status, read by exports.prometheus_metrics.
current_status = {}

exports.init = (client) ->
    # Publish the project status once right away, then refresh it every 30s.
    # TODO: could switch to faster when it's changing and slower when it isn't.
    # Returns whatever setInterval returns (the timer handle).
    refresh = ->
        update_project_status(client)
    refresh()
    setInterval(refresh, 30000)

# Compute a fresh status and write it to the project's `status` field via
# client.query.  The module-level current_status is only replaced when the
# computation succeeded; an error in either step is handed to the optional cb.
update_project_status = (client, cb) ->
    log = client.dbg("update_status")
    log()
    latest = undefined

    # step 1: gather the numbers
    measure = (next) ->
        compute_status (err, s) ->
            latest = s
            current_status = s unless err
            next(err)

    # step 2: send them off (skipped by async.series when step 1 failed)
    publish = (next) ->
        client.query
            query :
                projects :
                    project_id : client.client_id()
                    status     : latest
            cb    : next

    async.series [measure, publish], (err) ->
        cb?(err)

# Build a status object by running all collectors in parallel.  Each collector
# receives the shared status object and fills in its own part.
# cb(err, status) -- status is passed along even when a collector failed.
exports.compute_status = compute_status = (cb) ->
    status =
        time       : (new Date()).getTime()
        memory     : {rss: 0}
        disk_MB    : 0
        cpu        : {}
        start_ts   : start_ts
        session_id : session_id
        processes  : {}
    collectors = [compute_status_disk, cgroup_stats, processes_info, compute_status_tmp]
    tasks = collectors.map (collect) ->
        (done) -> collect(status, done)
    async.parallel tasks, (err) ->
        cb(err, status)

# Store the disk usage reported for "$HOME" in status.disk_MB, then forward
# disk_usage's error (if any) to cb.
compute_status_disk = (status, cb) ->
    on_usage = (err, used_mb) ->
        status.disk_MB = used_mb
        cb(err)
    disk_usage("$HOME", on_usage)

# Count the processes listed by `ps` for the account named 'user' and store the
# result in status.processes.count.
processes_info = (status, cb) ->
    columns = ['pid','lstart','time','rss','args'].join(',')
    on_ps_done = (err, out) ->
        if err or out.exit_code != 0
            cb(err)
            return
        # TODO parsing anything out of ps is really hard :-(
        # but we want to know how many sage, jupyter, console, etc. instances are running.
        rows = (row for row in out.stdout.split('\n') when row.length > 0)
        # minus one: no need to account for the ps process itself!
        status.processes.count = rows.length - 1
        cb()
    misc_node.execute_code
        command : 'ps'
        args    : ['--no-header', '-o', columns, '-u', 'user']
        bash    : false
        cb      : on_ps_done

# NOTE: we use tmpfs for /tmp, so RAM usage is the **sum** of /tmp and what
# processes use.
# Add the usage of /tmp to status.memory.rss (see the note above: /tmp is a
# tmpfs, so its content counts as RAM).  disk_usage's error is forwarded to cb.
#
# status.memory.rss is an accumulator that cgroup_stats adds to as well, so it
# must never be poisoned: on error `x` is undefined and on unparseable `df`
# output it is NaN -- in both cases the value is skipped instead of added.
#
# NOTE(review): disk_usage runs `df -BM`, and the rest of the memory numbers are
# divided by 1024; the factor 1000 here looks like it should be 1024 -- confirm
# before changing, since it alters the reported value.
compute_status_tmp = (status, cb) ->
    disk_usage "/tmp", (err, x) ->
        if not err and Number.isFinite(x)
            status.memory.rss += 1000*x
        cb(err)

# this grabs the memory stats directly from the sysfs cgroup files
# the actual usage is the sum of the rss values plus cache, but we leave cache aside
# Read memory, cpu and oom statistics from the cgroup (v1) files under
# /sys/fs/cgroup and merge them into `status`:
#   status.memory.rss   += total_rss   / 1024
#   status.memory.cache  = total_cache / 1024
#   status.memory.limit  = hierarchical_memory_limit / 1024
#   status.cpu.usage     = cpuacct.usage / 10^9
#   status.oom_kills     = the `oom_kill` counter of memory.oom_control
# the actual usage is the sum of the rss values plus cache, but we leave cache aside
#
# cb(err) is called exactly once.  If any of the files cannot be read, the error
# is passed on and `status` is left untouched (before, the partial result was
# dereferenced regardless, which threw a TypeError and never called cb).
cgroup_stats = (status, cb) ->
    async.parallel({
        memory : (cb) ->
            fs.readFile '/sys/fs/cgroup/memory/memory.stat', 'utf8', (err, data) ->
                if err
                    cb(err)
                    return
                # every line has the form "<key> <integer>"
                stats = {}
                for line in data.split('\n')
                    [key, value] = line.split(' ')
                    continue if not key   # blank line, e.g. after the trailing newline
                    stats[key] = parseInt(value, 10)
                cb(null, stats)

        cpu : (cb) ->
            fs.readFile '/sys/fs/cgroup/cpu,cpuacct/cpuacct.usage', 'utf8', (err, data) ->
                if err
                    cb(err)
                    return
                # parseFloat never throws; unparseable content yields NaN -> report 0
                seconds = parseFloat(data) / Math.pow(10, 9)
                cb(null, if Number.isFinite(seconds) then seconds else 0.0)

        oom : (cb) ->
            fs.readFile '/sys/fs/cgroup/memory/memory.oom_control', 'utf8', (err, data) ->
                if err
                    cb(err)
                    return
                kills = 0
                for line in data.split('\n')
                    # search string includes a trailing space, otherwise it matches 'oom_kill_disable'!
                    if misc.startswith(line, 'oom_kill ')
                        parsed = parseInt(line.split(' ')[1], 10)
                        kills = parsed if Number.isFinite(parsed)
                        break
                # the callback is invoked outside of any try block, so an exception
                # raised downstream cannot trigger a second invocation
                cb(null, kills)

    }, (err, res) ->
        if err
            cb(err)
            return
        kib = 1024 # divisor applied to the raw memory.stat values
        # total_rss includes total_rss_huge
        # Ref: https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
        status.memory.rss  += (res.memory.total_rss ? 0) / kib
        status.memory.cache = (res.memory.total_cache ? 0) / kib
        status.memory.limit = (res.memory.hierarchical_memory_limit ? 0) / kib
        status.cpu.usage    = res.cpu
        status.oom_kills    = res.oom
        cb()
    )


# Run `df -BM` on `path` through bash and report the third column of the last
# output line as an integer: cb(err) or cb(undefined, number).
# `path` is spliced into the shell command as is, so it may contain shell
# variables such as $HOME.
disk_usage = (path, cb) ->
    # awk strips every "M" from the line before printing field 3
    shell_cmd = "df -BM #{path} | tail -1 | awk '{gsub(\"M\",\"\");print $3}'"
    on_done = (err, out) ->
        return cb(err) if err
        cb(undefined, parseInt(out.stdout))
    misc_node.execute_code
        command : shell_cmd
        bash    : true
        cb      : on_done


# Every 60s, check if we can reach google's internal network -- in kucalc on GCE, this must be blocked.
# If we receive some information, exit with status code 99.
exports.init_gce_firewall_test = (logger, interval_ms=60*1000) ->
    return # temporarily disabled
    # Everything below is unreachable while the line above is in place.
    unless exports.IN_KUCALC
        logger?.warn("not running firewall test -- not in kucalc")
        return
    metadata_uri = 'http://metadata.google.internal/computeMetadata/v1/'

    # Only a timeout counts as "blocked"; any response or body means the request
    # got through, and the process exits with code 99.
    on_response = (err, res, body) ->
        if err?.code == 'ETIMEDOUT'
            logger?.log('test_firewall: timeout -> no action')
            return
        logger?.warn('test_firewall', res)
        logger?.warn('test_firewall', body)
        if res? or body?
            logger?.warn('test_firewall: request went through and got a response -> exiting with code 99')
            process.exit(99)
        else
            logger?.warn('test_firewall: request went through with no response -> no action')

    test_firewall = ->
        logger?.log("test_firewall")
        request = require('request')
        options =
            timeout : 3000
            headers :
                'Metadata-Flavor' : 'Google'
            uri     : metadata_uri
            method  : 'GET'
        request(options, on_response)

    test_firewall()
    setInterval(test_firewall, interval_ms)
    return

get_bugs_total = require('./bug-counter').default
# Render the project metrics in the Prometheus text format.  Every sample is
# labelled with the given project_id and the module's session_id; values come
# from the bug counter, start_ts and the module-level current_status (missing
# values are reported as 0).
exports.prometheus_metrics = (project_id) ->
    labels = "project_id=\"#{project_id}\",session_id=\"#{session_id}\""

    # the three lines (HELP, TYPE, sample) that belong to one metric
    sample = (name, type, value, help) ->
        header = if help? then "# HELP #{name} #{help}" else "# HELP #{name}"
        [header, "# TYPE #{name} #{type}", "#{name}{#{labels}} #{value}"]

    groups = [
        sample('cocalc_project_bugs_total', 'counter', get_bugs_total(), 'The total number of caught bugs.')
        sample('cocalc_project_start_time', 'counter', start_ts, 'when the project/session started')
        sample('cocalc_project_cpu_usage_seconds', 'counter', current_status.cpu?.usage ? 0.0)
        sample('cocalc_project_disk_usage_mb', 'gauge', current_status.disk_MB ? 0.0)
        sample('cocalc_project_memory_usage_ki', 'gauge', current_status.memory?.rss ? 0.0)
        sample('cocalc_project_memory_limit_ki', 'gauge', current_status.memory?.limit ? 0.0)
        sample('cocalc_project_running_processes_total', 'gauge', current_status.processes?.count ? 0)
        sample('cocalc_project_oom_kills_total', 'counter', current_status.oom_kills ? 0)
    ]
    lines = [].concat(groups...)
    return lines.join('\n') + '\n'  # makes sure the response ends with a newline!

# called inside raw_server
# Register the /health and /metrics routes on raw_server.  Does nothing unless
# exports.IN_KUCALC is true.  Also records project_id in the module-level
# PROJECT_ID.
exports.init_health_metrics = (raw_server, project_id) ->
    return if not exports.IN_KUCALC
    PROJECT_ID = project_id

    # Setup health and metrics (no url base prefix needed)
    raw_server.use '/health', (req, res) ->
        res.setHeader("Content-Type", "text/plain")
        res.setHeader('Cache-Control', 'no-cache, no-store')
        res.send('OK')

    # prometheus text format -- https://prometheus.io/docs/instrumenting/exposition_formats/#text-format-details
    raw_server.use '/metrics', (req, res) ->
        res.setHeader("Content-Type", "text/plain; version=0.0.4")
        res.setHeader('Cache-Control', 'no-cache, no-store')
        part1 = exports.prometheus_metrics(project_id)
        # The handler is async because of `await`; without the try/catch a rejected
        # promise would leave the request without any response and surface as an
        # unhandled rejection.
        try
            part2 = await prom_client.register.metrics()
        catch err
            res.status(500).send("failed to collect metrics: #{err}\n")
            return
        res.send(part1 + '\n' + part2 + '\n')