/*
 *  This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
 *  License: AGPLv3 s.t. "Commons Clause" – see LICENSE.md for details
 */

// endpoints for various health checks

import getLogger from "@cocalc/backend/logger";
const { new_counter } = require("@cocalc/hub/metrics-recorder");
import { howLongDisconnectedMins } from "@cocalc/database/postgres/record-connect-error";
import type { PostgreSQL } from "@cocalc/database/postgres/types";
import { seconds2hms } from "@cocalc/util/misc";
import express, { Response } from "express";
import { createServer, Server } from "net";
import { isFloat } from "validator";
import { database_is_working } from "./hub_register";
// Logger of this module, named "hub:healthcheck".
// `L` is a shorthand for its debug output and is used throughout this file.
const logger = getLogger("hub:healthcheck");
const { debug: L } = logger;

// Counter "healthchecks_total" with a single "status" label.
// process_health_check increments it with the label values "OK", "FAIL" or "ERROR".
const HEALTHCHECKS = new_counter(
  "healthchecks_total",
  "test healthcheck counter",
  ["status"]
);

// Result of a health probe, as sent back by the HTTP routes of this module.
interface HealthcheckData {
  code: 200 | 404; // HTTP status of the response: 200 when fine, 404 when not
  txt: string; // plain-text body of the response
}

/**
 * Computes the self-termination schedule of this hub.
 *
 * Self termination is only activated if there is a COCALC_HUB_SELF_TERMINATE
 * environment variable. Its value is "<from>,<to>,<drain>": the minimum and
 * maximum lifetime in hours, and a drain period in minutes at the end of it.
 * E.g. "24,48,15" for an uptime between 1 and 2 days and 15 minutes of draining.
 * The actual lifetime is picked at random between `from` and `to`.
 * Whitespace around the individual fields is ignored.
 *
 * @returns `startup` (ms since epoch) and, only if self-termination is
 *   configured, the `drain` and `shutdown` timestamps (ms since epoch).
 * @throws Error if the variable is set, but a field is missing or not a
 *   positive float, if `from` is larger than `to`, or if the drain period is so
 *   long that draining would have to start before startup.
 */
function init_self_terminate(): {
  startup: number;
  shutdown?: number; // when to shutdown (causes a failed health check)
  drain?: number; // when to start draining, causes a proxy server to no longer send traffic
} {
  const D = logger.extend("init_self_terminate").debug;
  const startup = Date.now();
  const conf = process.env.COCALC_HUB_SELF_TERMINATE;
  if (conf == null) {
    D("COCALC_HUB_SELF_TERMINATE env var not set, hence no self-termination");
    return { startup };
  }
  // Trim every field and default missing ones to "". That way an incomplete
  // value like "24,48" fails the explicit checks below with a clear message,
  // instead of handing `undefined` to isFloat (which only accepts strings),
  // and a value like "24, 48, 15" is accepted.
  const [from_str = "", to_str = "", drain_str = ""] = conf
    .split(",")
    .map((field) => field.trim());
  if (!isFloat(from_str, { gt: 0 })) {
    throw new Error("COCALC_HUB_SELF_TERMINATE/from not a positive float");
  }
  if (!isFloat(to_str, { gt: 0 })) {
    throw new Error("COCALC_HUB_SELF_TERMINATE/to not a positive float");
  }
  if (!isFloat(drain_str, { gt: 0 })) {
    throw new Error("COCALC_HUB_SELF_TERMINATE/drain not a positive float");
  }
  const from = parseFloat(from_str);
  const to = parseFloat(to_str);
  const drain_h = parseFloat(drain_str) / 60; // minutes to hours
  D("parsed data:", { from, to, drain_h });
  if (from > to) {
    throw new Error(
      "COCALC_HUB_SELF_TERMINATE 'from' must be smaller than 'to', e.g. '24,48,15'"
    );
  }
  // random part of the lifetime on top of the minimum `from`
  const uptime = Math.random() * (to - from); // hours
  const hours2ms = 1000 * 60 * 60;
  const shutdown = startup + (from + uptime) * hours2ms;
  // draining starts `drain_h` hours before the shutdown
  const drain = shutdown - drain_h * hours2ms;
  if (startup > drain) {
    throw new Error(
      `COCALC_HUB_SELF_TERMINATE: startup must be smaller than drain – ${startup}>${drain}`
    );
  }
  D({
    startup: new Date(startup).toISOString(),
    drain: new Date(drain).toISOString(),
    shutdown: new Date(shutdown).toISOString(),
    uptime: seconds2hms((hours2ms * uptime) / 1000),
    draintime: seconds2hms((drain_h * hours2ms) / 1000),
  });
  return { startup, shutdown, drain };
}

// The schedule is computed once, when this module is loaded.
// `shutdown` and `drain` are undefined if self-termination is not configured.
const { startup, shutdown, drain } = init_self_terminate();

// Where the agent-check TCP server listens.
// Port 0 means "not set", in which case setup_agent_check does not start a server.
let agent_port = 0;
let agent_host = "0.0.0.0";

/**
 * Sets port and host of the agent-check endpoint.
 * The values are read by setup_agent_check, hence this only has an effect
 * if it is called before setup_health_checks.
 *
 * @param port TCP port to listen on; 0 leaves the agent check disabled
 * @param host address to bind to
 */
export function set_agent_endpoint(port: number, host: string): void {
  agent_port = port;
  agent_host = host;
  // log after the assignment, so that the message shows the endpoint which
  // has just been set and not the previous one
  L(`set_agent_endpoint ${agent_host}:${agent_port}`);
}

// The agent-check TCP server; undefined until setup_agent_check creates it.
let agent_check_server: Server | undefined;

/**
 * Starts the HAProxy agent-check TCP endpoint, if it is configured.
 * https://cbonte.github.io/haproxy-dconv/2.0/configuration.html#5.2-agent-check
 *
 * Nothing is started unless a port has been set via set_agent_endpoint and
 * self-termination is configured (only then there is a drain time).
 * Every incoming connection receives a single line and is closed afterwards:
 * "ready up 100%" before the drain time and "10%" from then on.
 *
 * For development, set the env var in your startup script or terminal init file
 *   export COCALC_HUB_SELF_TERMINATE=.1,.2,1
 * and then query it like that
 *   $ telnet 0.0.0.0 $(cat $COCALC_ROOT/dev/project/ports/agent-port)
 */
function setup_agent_check() {
  if (agent_port == 0) {
    L("setup_agent_check: agent_port not set, no agent checks");
    return;
  }
  if (drain == null) {
    L("setup_agent_check: self-termination not configured, no agent checks");
    return;
  }
  const drain_at = drain;

  // TODO this could also return a "weight" for this server, based on load values
  // there is also "drain", but we set it to "10%" to avoid a nasty situation, when all endpoints are draining.
  // ATTN: weight must be set as well, which is poorly documented here:
  // https://cbonte.github.io/haproxy-dconv/2.0/configuration.html#5.2-weight
  agent_check_server = createServer((c) => {
    // without a listener, an 'error' event on this socket (e.g. the probe
    // resets the connection) would be thrown and could take down the hub
    c.on("error", (err) => {
      L(`setup_agent_check: socket error: ${err}`);
    });
    const msg = Date.now() < drain_at ? "ready up 100%" : "10%";
    // end() writes the reply and finishes the stream; the socket is destroyed
    // only afterwards, because destroying it right after a write() may discard
    // data that has not been flushed yet
    c.end(msg + "\r\n", () => c.destroy());
  });

  agent_check_server.listen(agent_port, agent_host);
  L(`setup_agent_check: listening on ${agent_host}:${agent_port}`);
}

// Outcome of a single health check, as evaluated by process_health_check.
export interface Check {
  status: string; // one-line description, becomes part of the /healthcheck response text
  abort?: boolean; // true marks the check as failed, the overall result is 404 then; defaults to false
}

// Arguments of setup_health_checks.
interface Opts {
  db: PostgreSQL; // database object, asked for its concurrent count and warn threshold
  router: express.Router; // router on which the health check routes are registered
  extra?: (() => Promise<Check>)[]; // additional health checks
}

/**
 * Liveness probe behind the "/alive" route.
 *
 * It is a standalone exported function (instead of living inside
 * setup_health_checks), because proxy.coffee needs it as well – that file would
 * have to be rewritten and restructured before it could simply be wrapped in a
 * router.
 *
 * @returns code 404 if the database is not working or the scheduled shutdown
 *   time has passed, code 200 otherwise; `txt` states the reason.
 */
export function process_alive(): HealthcheckData {
  // a 404 stops haproxy from routing traffic to us until the db connection
  // starts working again
  if (!database_is_working()) {
    return { txt: "alive: NO – database not working", code: 404 };
  }
  const expired = shutdown != null && Date.now() > shutdown;
  if (expired) {
    return { txt: "alive: NO – shutdown initiated", code: 404 };
  }
  return { txt: "alive: YES", code: 200 };
}

/**
 * Health check comparing the database's concurrent count with its warn threshold.
 *
 * @param db database object providing `concurrent()` and `_concurrent_warn`
 * @returns a failed check (`abort: true`) once the count reaches the threshold,
 *   otherwise a status without the `abort` field
 */
function checkConcurrent(db: PostgreSQL): Check {
  const current = db.concurrent();
  const limit = db._concurrent_warn;
  return current >= limit
    ? {
        status: `hub not healthy, since concurrent ${current} >= ${limit}`,
        abort: true,
      }
    : { status: `concurrent ${current} < ${limit}` };
}

/**
 * Health check reporting the uptime of this hub relative to its
 * self-termination schedule. Every status line is also written to the debug log.
 *
 * @returns a failed check (`abort: true`) once the shutdown time is reached;
 *   otherwise a status telling when draining and termination happen, or that
 *   there is no self-termination at all
 */
function checkUptime(): Check {
  // logs the status line and wraps it into a Check
  const report = (status: string, abort?: true): Check => {
    L(status);
    return abort ? { status, abort } : { status };
  };

  const now = Date.now();
  const uptime = seconds2hms((now - startup) / 1000);

  if (shutdown == null || drain == null) {
    return report(`uptime ${uptime} – no self-termination`);
  }
  if (now >= shutdown) {
    return report(`uptime ${uptime} – expired, terminating now`, true);
  }

  const until = seconds2hms((shutdown - now) / 1000);
  let draining = "draining now";
  if (drain > now) {
    draining = `draining in ${seconds2hms((drain - now) / 1000)}`;
  }
  return report(`uptime ${uptime} – ${draining} – terminating in ${until}`);
}

/**
 * Parses the value of COCALC_DB_ERRORS_THRESHOLD_MIN.
 *
 * @param raw the raw value of the environment variable, if there is one
 * @param fallback used when `raw` is missing or does not start with a decimal integer
 * @returns the parsed integer, or `fallback` instead of NaN
 */
function parseDbErrorsThresholdMin(
  raw: string | undefined,
  fallback: number = 5
): number {
  const parsed = Number.parseInt(raw ?? "", 10);
  // NaN compares false with everything, hence it would silently switch off the
  // connectivity check – use the default instead
  return Number.isNaN(parsed) ? fallback : parsed;
}

// If there is no connection to the database for that many minutes,
// declare the hub unhealthy. A value <= 0 disables this check.
const DB_ERRORS_THRESHOLD_MIN = parseDbErrorsThresholdMin(
  process.env.COCALC_DB_ERRORS_THRESHOLD_MIN
);

/**
 * Health check based on how long the database has been disconnected, as
 * reported by howLongDisconnectedMins().
 *
 * @returns a note that the check is disabled if DB_ERRORS_THRESHOLD_MIN <= 0;
 *   a passing check if no disconnect time is reported; otherwise a status with
 *   the minutes (2 decimal places), failing once they reach the threshold
 */
function checkDBConnectivity(): Check {
  if (DB_ERRORS_THRESHOLD_MIN <= 0) {
    return { status: "db connectivity check disabled" };
  }

  const mins = howLongDisconnectedMins();
  if (mins == null) {
    return { status: "no DB connection problems", abort: false };
  }

  const abort = mins >= DB_ERRORS_THRESHOLD_MIN;
  const relation = abort ? ">=" : "<";
  const status = `DB problems for ${mins.toFixed(
    2
  )} ${relation} ${DB_ERRORS_THRESHOLD_MIN} mins`;
  return { status, abort };
}

/**
 * Runs all health checks, one after the other, and combines their results.
 * This is the implementation of the "/healthcheck" route; it is a standalone
 * function for the same reason as process_alive().
 *
 * The checks are: concurrent database usage, uptime, database connectivity and
 * then all `extra` ones. Each check contributes one line to the response text.
 * The metrics counter is incremented once per call with "FAIL" or "OK", and
 * additionally with "ERROR" for every check that throws.
 *
 * @param db database object, passed on to the concurrency check
 * @param extra additional health checks
 * @returns code 404 if at least one check asks to abort, otherwise 200, and
 *   the combined text of all checks
 */
async function process_health_check(
  db: PostgreSQL,
  extra: (() => Promise<Check>)[] = []
): Promise<HealthcheckData> {
  let any_abort = false;
  let txt = "healthchecks:\n";
  for (const test of [
    () => checkConcurrent(db),
    checkUptime,
    checkDBConnectivity,
    ...extra,
  ]) {
    try {
      const { status, abort = false } = await test();
      const statusTxt = abort ? "FAIL" : "OK";
      txt += `${status} – ${statusTxt}\n`;
      any_abort = any_abort || abort;
      L(`process_health_check: ${status} – ${statusTxt}`);
    } catch (err) {
      // A check that throws does not change the status code, but it must not
      // vanish from the response either. The details stay in the log, since
      // the response might be visible to others.
      L(`process_health_check ERROR: ${err}`);
      txt += "check raised an error, see hub log – ERROR\n";
      HEALTHCHECKS.labels("ERROR").inc();
    }
  }
  const code = any_abort ? 404 : 200;
  HEALTHCHECKS.labels(any_abort ? "FAIL" : "OK").inc();
  return { code, txt };
}

/**
 * Starts the agent-check endpoint (if configured) and registers the HTTP
 * health check routes "/alive", "/healthcheck", "/concurrent-warn" and
 * "/concurrent" on the given router. All of them respond with plain text.
 *
 * @param opts the database, the router and optional additional health checks
 */
export async function setup_health_checks(opts: Opts): Promise<void> {
  const database = opts.db;
  const extraChecks = opts.extra;
  const routes = opts.router;
  setup_agent_check();

  // used by HAPROXY for testing that this hub is OK to receive traffic
  routes.get("/alive", (_, res: Response) => {
    const alive = process_alive();
    res.type("txt").status(alive.code).send(alive.txt);
  });

  // this is a more general check than concurrent-warn
  // additionally to checking the database condition, it also self-terminates
  // this hub if it is running for quite some time. beyond that, in the future
  // there could be even more checks on top of that.
  routes.get("/healthcheck", async (_, res: Response) => {
    const result = await process_health_check(database, extraChecks);
    res.status(result.code).type("txt").send(result.txt);
  });

  // /concurrent-warn -- could be used by kubernetes to decide whether or not to kill the container; if
  // below the warn thresh, returns number of concurrent connection; if hits warn, then
  // returns 404 error, meaning hub may be unhealthy.  Kubernetes will try a few times before
  // killing the container.  Will also return 404 if there is no working database connection.
  routes.get("/concurrent-warn", (_, res) => {
    res.type("txt");
    const dbWorking = database_is_working();
    if (!dbWorking) {
      L("/concurrent-warn: not healthy, since database connection not working");
      res.status(404).end();
      return;
    }

    const current = database.concurrent();
    if (current >= database._concurrent_warn) {
      L(
        `/concurrent-warn: not healthy, since concurrent ${current} >= ${database._concurrent_warn}`
      );
      res.status(404).end();
      return;
    }
    res.send(`${current}`);
  });

  // Return number of concurrent connections (could be useful)
  routes.get("/concurrent", (_, res) => {
    res.type("txt");
    const current = database.concurrent();
    res.send(`${current}`);
  });
}
