#!/usr/bin/env bash
# Watchdog for replicas-engine: monitors the process and restarts on crash.
# Uses exponential backoff to avoid tight restart loops on persistent failures.
# Resets backoff after the engine runs stably for STABLE_THRESHOLD seconds.
set -u

# --- Fixed settings -------------------------------------------------------
# File that holds the PID of the currently running engine.
PIDFILE=/tmp/replicas-engine.pid
# Upper bound, in seconds, for the delay between restarts.
MAX_BACKOFF=60
# Number of crash-restarts tolerated before the watchdog gives up.
MAX_RESTARTS=20
# An engine run lasting at least this many seconds counts as stable.
STABLE_THRESHOLD=300

# --- Loop state (mutated while supervising) -------------------------------
# Current restart delay in seconds.
BACKOFF=1
# Crash-restarts performed so far.
RESTART_COUNT=0

# --- Command line ---------------------------------------------------------
# Optional first argument; when non-empty it is forwarded to the engine and
# the start-up log line is tagged "[warming mode]".
WARMING_FLAG=${1-}

# Print one watchdog log line to stdout.
# Arguments: the message words; they are joined into a single string via "$*".
# Outputs:   "[<ISO-8601 timestamp>] [engine-watchdog] <message>"
log() {
  local stamp
  stamp=$(date -Iseconds)
  printf '[%s] [engine-watchdog] %s\n' "$stamp" "$*"
}

cleanup() {
  log "Watchdog shutting down"
  if [ -f "$PIDFILE" ]; then
    local pid
    pid=$(cat "$PIDFILE")
    if kill -0 "$pid" 2>/dev/null; then
      kill "$pid" 2>/dev/null
      wait "$pid" 2>/dev/null
    fi
  fi
  rm -f "$PIDFILE"
  exit 0
}

trap cleanup SIGTERM SIGINT

# Default RLIMIT_MEMLOCK is 8 MB; engine needs ~120 MB. Cap at 512 MB so a
# runaway can't lock the whole sandbox. The limit is set on this shell ($$).
#
# Best effort: a failure here must not stop the watchdog, but it is logged so
# that a later engine crash caused by a too-low memlock limit can be traced.
# `sudo -n` never prompts, so a sudo setup that would ask for a password fails
# immediately instead of blocking start-up on a prompt nobody can answer.
if ! sudo -n prlimit --memlock=536870912:536870912 --pid "$$" 2>/dev/null; then
  log "WARNING: could not raise RLIMIT_MEMLOCK to 512 MB; continuing with the current limit"
fi

# Work out the LD_PRELOAD value used for the engine process.
#   - shim not readable            -> empty (no preload override)
#   - shim already in LD_PRELOAD   -> inherited value, unchanged
#   - LD_PRELOAD empty or unset    -> just the shim
#   - otherwise                    -> shim prepended, so inherited entries
#                                     still load
LOCKMEM_SO=/usr/local/lib/replicas-lockmem.so
ENGINE_LD_PRELOAD=""
if [[ -r "$LOCKMEM_SO" ]]; then
  # Wrapping the list in colons lets one pattern cover "only entry", "first",
  # "last" and "in the middle" of the colon-separated list.
  case ":${LD_PRELOAD:-}:" in
    *":$LOCKMEM_SO:"*)
      ENGINE_LD_PRELOAD="$LD_PRELOAD"
      ;;
    "::")
      ENGINE_LD_PRELOAD="$LOCKMEM_SO"
      ;;
    *)
      ENGINE_LD_PRELOAD="$LOCKMEM_SO:$LD_PRELOAD"
      ;;
  esac
fi

# Locate the engine executable; without it there is nothing to supervise.
ENGINE_BIN="$(command -v replicas-engine || true)"
if [[ -z "$ENGINE_BIN" ]]; then
  log "replicas-engine not found on PATH"
  exit 127
fi

# Build the launch command as an array, front to back:
#   [node] <engine> [warming flag]
ENGINE_CMD=()
if [[ -r "$ENGINE_BIN" ]]; then
  # Inspect the first line only. If `read` reports failure the shebang is
  # treated as absent.
  IFS= read -r ENGINE_SHEBANG < "$ENGINE_BIN" || ENGINE_SHEBANG=""
  case "$ENGINE_SHEBANG" in
    '#!'*node*)
      # Script with a Node shebang: invoke the node found on PATH explicitly
      # and pass the script as its argument.
      NODE_BIN="$(command -v node || true)"
      if [[ -z "$NODE_BIN" ]]; then
        log "node not found on PATH for Node shebang engine: $ENGINE_BIN"
        exit 127
      fi
      ENGINE_CMD+=("$NODE_BIN")
      ;;
  esac
fi
ENGINE_CMD+=("$ENGINE_BIN")

# Forward the optional first watchdog argument to the engine.
[[ -z "$WARMING_FLAG" ]] || ENGINE_CMD+=("$WARMING_FLAG")

# Supervision loop. Each iteration starts the engine in the background,
# records its PID in PIDFILE and blocks until it exits. The loop ends when the
# engine exits with 0 or 143, or when the restart budget is exhausted;
# otherwise the engine is relaunched, after a backoff delay unless the run
# lasted at least STABLE_THRESHOLD seconds.
while true; do
  if [ "$RESTART_COUNT" -ge "$MAX_RESTARTS" ]; then
    log "Exceeded max restarts ($MAX_RESTARTS). Giving up."
    exit 1
  fi

  # One log file per launch attempt, named by start time (1 s resolution).
  # Output is appended, so attempts starting in the same second share a file.
  BOOTSTRAP_LOG="/tmp/replicas-engine-bootstrap-$(date +%Y%m%d-%H%M%S).log"
  log "Starting replicas-engine (attempt $((RESTART_COUNT + 1)))${WARMING_FLAG:+ [warming mode]}"

  START_TIME=$(date +%s)
  # stdout and stderr are both piped through `cat` into the bootstrap log.
  # LD_PRELOAD is set for the engine command only, not for the watchdog.
  if [ -n "$ENGINE_LD_PRELOAD" ]; then
    LD_PRELOAD="$ENGINE_LD_PRELOAD" "${ENGINE_CMD[@]}" > >(cat >> "$BOOTSTRAP_LOG") 2>&1 &
  else
    "${ENGINE_CMD[@]}" > >(cat >> "$BOOTSTRAP_LOG") 2>&1 &
  fi
  ENGINE_PID=$!
  echo "$ENGINE_PID" > "$PIDFILE"

  # The `wait` builtin is interrupted by trapped signals, so the SIGTERM/SIGINT
  # handler runs promptly while the engine is up.
  wait "$ENGINE_PID"
  EXIT_CODE=$?
  END_TIME=$(date +%s)
  UPTIME=$((END_TIME - START_TIME))

  rm -f "$PIDFILE"

  # Exit code 0 = clean shutdown, 143 = SIGTERM (sandbox stopping)
  if [ "$EXIT_CODE" -eq 0 ] || [ "$EXIT_CODE" -eq 143 ]; then
    log "Engine exited (code $EXIT_CODE). Not restarting."
    exit 0
  fi

  RESTART_COUNT=$((RESTART_COUNT + 1))

  if [ "$UPTIME" -ge "$STABLE_THRESHOLD" ]; then
    # A crash after a long, stable run is treated as a fresh failure: restart
    # immediately and start the backoff sequence and restart budget over.
    BACKOFF=1
    RESTART_COUNT=1
    log "Engine crashed (exit code $EXIT_CODE, uptime ${UPTIME}s) after stable run. Resetting backoff."
  else
    log "Engine crashed (exit code $EXIT_CODE, uptime ${UPTIME}s). Restart #$RESTART_COUNT in ${BACKOFF}s..."
    # Sleep in the background and block in `wait` instead of running `sleep`
    # in the foreground: bash defers trap handlers until a foreground command
    # finishes, so a plain `sleep` would delay SIGTERM/SIGINT handling by up
    # to MAX_BACKOFF seconds. `wait` returns as soon as a trapped signal
    # arrives. The orphaned sleep, if any, just expires on its own.
    sleep "$BACKOFF" &
    wait "$!"
    # Double the delay for the next crash, capped at MAX_BACKOFF.
    if [ "$BACKOFF" -lt "$MAX_BACKOFF" ]; then
      BACKOFF=$((BACKOFF * 2))
      if [ "$BACKOFF" -gt "$MAX_BACKOFF" ]; then
        BACKOFF=$MAX_BACKOFF
      fi
    fi
  fi
done
