#!/bin/bash
# upgrade-guardian.sh — post-upgrade health monitor
# Started by daemon after npm install, runs as detached background process.
# Monitors healthz for 90s, rolls back on crash_count >= 3.
# This file is NOT inside the npm package — installed once to ~/.grix/bin/.

# Paths and tunables. GRIX_CONNECTOR_HOME overrides the default ~/.grix root.
GRIX_HOME="${GRIX_CONNECTOR_HOME:-$HOME/.grix}"
PENDING_FILE="$GRIX_HOME/data/upgrade-pending.json"
LOCK_FILE="$GRIX_HOME/upgrade-guardian.lock"
LOG_FILE="$GRIX_HOME/log/upgrade.log"
MAX_CRASHES=3
HEALTH_TIMEOUT=90
HEALTH_PORT_FILE="$GRIX_HOME/data/health-port"

# Print the TCP port stored on the first line of a port file.
# Arguments: $1 - path to the port file
#            $2 - fallback port
# Outputs:   the stored port with whitespace (including CR) removed, or the
#            fallback when the file is missing, unreadable, empty, non-numeric
#            or outside 1-65535.
read_health_port() {
    local file=$1 fallback=$2 port=""
    if [ -f "$file" ]; then
        # read returns non-zero on a missing trailing newline but still fills
        # $port; an unreadable file simply leaves $port empty.
        { IFS= read -r port < "$file"; } 2>/dev/null
        port=${port//[[:space:]]/}
    fi
    case $port in
        '' | *[!0-9]* | ??????*)
            port=$fallback
            ;;
        *)
            if [ "$port" -lt 1 ] || [ "$port" -gt 65535 ]; then
                port=$fallback
            fi
            ;;
    esac
    printf '%s\n' "$port"
}

HEALTH_PORT=$(read_health_port "$HEALTH_PORT_FILE" "19579")
HEALTH_URL="http://127.0.0.1:${HEALTH_PORT}/healthz"
NPM_PACKAGE="grix-connector"

# Create the log and data directories up front (no-op when they already exist)
mkdir -p "$GRIX_HOME"/{log,data}

# Append one timestamped guardian line to $LOG_FILE.
# Arguments: $1 - message text (any further arguments are ignored)
log() {
    local stamp
    stamp=$(date '+%Y-%m-%dT%H:%M:%S%z')
    printf '[%s] guardian: %s\n' "$stamp" "$1" >> "$LOG_FILE"
}

# Prevent duplicate guardian instances.
#
# acquire_guardian_lock claims $LOCK_FILE for this process.
# Globals:  LOCK_FILE (read; the file is created or replaced)
# Returns:  0 - lock file created, now holding this shell's PID
#           1 - another live process holds the lock
#           2 - the lock could not be created (e.g. unwritable directory)
acquire_guardian_lock() {
    local holder attempt
    for attempt in 1 2; do
        # With noclobber, ">" refuses to overwrite an existing file, so the
        # existence check and the creation happen in a single step instead of
        # a racy test-then-write. The subshell keeps noclobber from leaking
        # into the rest of the script; $$ is still the main shell's PID there.
        if ( set -o noclobber; echo "$$" > "$LOCK_FILE" ) 2>/dev/null; then
            return 0
        fi
        holder=$(cat "$LOCK_FILE" 2>/dev/null)
        # Only a purely numeric PID is handed to kill.
        if [[ "$holder" =~ ^[0-9]+$ ]] && kill -0 "$holder" 2>/dev/null; then
            return 1
        fi
        # Holder is gone or the lock is unreadable: drop it and retry once.
        rm -f "$LOCK_FILE"
    done
    return 2
}

acquire_guardian_lock
case $? in
    0) trap 'rm -f "$LOCK_FILE"' EXIT ;;  # we own the lock: release it on exit
    1) exit 0 ;;                          # another guardian is already running
    *) : ;;                               # no lock possible: continue unguarded
esac

# Without a pending-upgrade marker there is nothing to supervise
[ -f "$PENDING_FILE" ] || exit 0

# Extract fields from JSON (portable: no jq dependency).
# Both helpers tolerate optional whitespace around the colon, so compact and
# pretty-printed markers parse alike, and use only the first occurrence of a
# key. Key and value must be on the same line.

# Print the value of string field $1 found in JSON file $2 (empty if absent).
json_string_field() {
    local name=$1 file=$2
    grep -o "\"$name\"[[:space:]]*:[[:space:]]*\"[^\"]*\"" "$file" \
        | head -n 1 \
        | cut -d'"' -f4
}

# Print the digits of unsigned integer field $1 found in JSON file $2
# (empty if absent).
json_number_field() {
    local name=$1 file=$2 match
    match=$(grep -o "\"$name\"[[:space:]]*:[[:space:]]*[0-9][0-9]*" "$file" | head -n 1)
    # Drop everything up to the last non-digit, leaving only the number.
    printf '%s\n' "${match##*[!0-9]}"
}

FROM_VERSION=$(json_string_field from_version "$PENDING_FILE")
TARGET_VERSION=$(json_string_field target_version "$PENDING_FILE")
UPGRADED_AT=$(json_string_field upgraded_at "$PENDING_FILE")
CRASH_COUNT=$(json_number_field crash_count "$PENDING_FILE")
CRASH_COUNT=${CRASH_COUNT:-0}

# A marker without from_version is treated as corrupt: log it, discard it, stop
[ -n "$FROM_VERSION" ] || {
    log "pending file corrupt, missing from_version, removing"
    rm -f "$PENDING_FILE"
    exit 0
}

log "started: from=$FROM_VERSION target=$TARGET_VERSION crash_count=$CRASH_COUNT"

# Wait for old daemon to exit (SIGTERM + shutdown + restart)
sleep 15

# Probe the health endpoint once; succeeds when curl -f accepts the response.
# The timeouts bound every probe so a daemon that accepts the connection but
# never answers cannot stall the guardian indefinitely.
healthz_ok() {
    curl -sf --connect-timeout 2 --max-time 5 "$HEALTH_URL" > /dev/null 2>&1
}

# Monitor loop: poll healthz every 3s until it passes or HEALTH_TIMEOUT
# seconds of wall-clock time (bash's SECONDS counter) have elapsed, so time
# spent inside slow probes counts against the budget too.
MONITOR_START=$SECONDS
ELAPSED=0
while [ "$ELAPSED" -lt "$HEALTH_TIMEOUT" ]; do
    if healthz_ok; then
        log "healthz passed, upgrade successful"
        rm -f "$PENDING_FILE"
        exit 0
    fi
    sleep 3
    ELAPSED=$((SECONDS - MONITOR_START))
done

# Extra grace period for slow systems
sleep 15
if healthz_ok; then
    log "healthz passed (delayed), upgrade successful"
    rm -f "$PENDING_FILE"
    exit 0
fi

# The health endpoint never passed: count one more failed start
(( CRASH_COUNT += 1 ))
log "healthz timeout, crash_count=$CRASH_COUNT"

# Once the count reaches MAX_CRASHES, reinstall FROM_VERSION globally via npm.
# The pending marker is removed only when that install succeeds.
if (( CRASH_COUNT >= MAX_CRASHES )); then
    log "rollback: installing ${NPM_PACKAGE}@${FROM_VERSION}"
    if ! npm install -g "${NPM_PACKAGE}@${FROM_VERSION}" --prefer-online --no-audit --no-fund >> "$LOG_FILE" 2>&1; then
        log "ROLLBACK FAILED — manual intervention required"
        exit 0
    fi
    log "rollback succeeded"
    rm -f "$PENDING_FILE"
    exit 0
fi

# Under threshold — update crash_count and exit, let process manager restart

# Rewrite $PENDING_FILE with the current field values.
# Globals:  PENDING_FILE, FROM_VERSION, TARGET_VERSION, UPGRADED_AT,
#           CRASH_COUNT (all read)
# Returns:  0 when the new marker is in place; 1 otherwise, after removing
#           the temp file. The existing marker is replaced only by a
#           successful rename.
persist_crash_count() {
    local tmp="$PENDING_FILE.tmp"
    # Write to a temp file first and rename only if the write succeeded, so a
    # failed or partial write never replaces the existing marker.
    if printf '{"from_version":"%s","target_version":"%s","upgraded_at":"%s","crash_count":%s}\n' \
            "$FROM_VERSION" "$TARGET_VERSION" "$UPGRADED_AT" "$CRASH_COUNT" > "$tmp" \
        && mv "$tmp" "$PENDING_FILE"; then
        return 0
    fi
    rm -f "$tmp"
    return 1
}

if persist_crash_count; then
    log "updated crash_count to $CRASH_COUNT, exiting"
else
    log "failed to update crash_count in $PENDING_FILE, exiting"
fi
