groups:
  - name: cfn_loop_recording_rules
    interval: 30s
    rules:
      # Agent spawn rate: per-second average increase of cfn_agent_spawns_total,
      # computed over a 1-minute window. rate() always yields a per-second
      # value; multiply by 60 to read it as spawns per minute.
      # Consumed by the UnusualAgentSpawnRate alert.
      - record: cfn:agent_spawn_rate:1m
        expr: rate(cfn_agent_spawns_total[1m])

      # Fraction of agent executions that finished with status="success" over
      # the last 5 minutes, broken down per (team, agent_type).
      - record: cfn:agent_success_rate:5m
        expr: |
          sum by (team, agent_type) (
            rate(cfn_agent_executions_total{status="success"}[5m])
          )
          /
          sum by (team, agent_type) (
            rate(cfn_agent_executions_total[5m])
          )

      # Agent execution duration quantiles (P50, P95, P99). These are quantile
      # estimates derived from the cfn_agent_execution_duration_seconds
      # histogram buckets over a 5-minute window, not averages. No aggregation
      # is applied, so one result is produced per distinct label set.
      # P50 (median):
      - record: cfn:agent_duration:p50
        expr: histogram_quantile(0.50, rate(cfn_agent_execution_duration_seconds_bucket[5m]))

      # P95 agent execution duration in seconds over a 5-minute window.
      # Consumed by the SlowAgentExecution alert.
      - record: cfn:agent_duration:p95
        expr: histogram_quantile(0.95, rate(cfn_agent_execution_duration_seconds_bucket[5m]))

      # P99 agent execution duration in seconds over a 5-minute window.
      - record: cfn:agent_duration:p99
        expr: histogram_quantile(0.99, rate(cfn_agent_execution_duration_seconds_bucket[5m]))

      # Dollars spent during the trailing hour, totalled per team.
      - record: cfn:cost_by_team:1h
        expr: sum by (team) (increase(cfn_agent_cost_dollars_total[1h]))

      # Dollars spent during the trailing hour, totalled per project.
      - record: cfn:cost_by_project:1h
        expr: sum by (project) (increase(cfn_agent_cost_dollars_total[1h]))

      # Tokens consumed during the trailing hour, totalled per
      # (provider, token_type).
      - record: cfn:tokens_by_provider:1h
        expr: sum by (provider, token_type) (increase(cfn_agent_tokens_total[1h]))

  - name: cfn_loop_alerts
    rules:
      # Warning tier: more than 10% of executions for a (team, agent_type) pair
      # ended with status="failure" over the last 5 minutes, sustained for 5m.
      - alert: HighAgentFailureRate
        expr: |
          (
            sum by (team, agent_type) (
              rate(cfn_agent_executions_total{status="failure"}[5m])
            )
            /
            sum by (team, agent_type) (
              rate(cfn_agent_executions_total[5m])
            )
          ) > 0.10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High agent failure rate detected"
          description: "Agent {{ $labels.agent_type }} in team {{ $labels.team }} has failure rate above 10% (current: {{ $value | humanizePercentage }})"

      # Critical tier: more than 25% of executions for a (team, agent_type)
      # pair ended with status="failure" over the last 5 minutes, sustained
      # for 2m.
      - alert: CriticalAgentFailureRate
        expr: |
          (
            sum by (team, agent_type) (
              rate(cfn_agent_executions_total{status="failure"}[5m])
            )
            /
            sum by (team, agent_type) (
              rate(cfn_agent_executions_total[5m])
            )
          ) > 0.25
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical agent failure rate detected"
          description: "Agent {{ $labels.agent_type }} in team {{ $labels.team }} has failure rate above 25% (current: {{ $value | humanizePercentage }})"

      # Slow agent execution (P95 above 5 minutes)
      # Reads the cfn:agent_duration:p95 recording rule; 300 is the threshold
      # in seconds. Must hold for 10m before firing.
      - alert: SlowAgentExecution
        expr: cfn:agent_duration:p95 > 300
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Slow agent execution detected"
          description: "P95 agent execution time is above 5 minutes (current: {{ $value | humanizeDuration }})"

      # Health check failures
      # Fires per series when the failure counter has increased within the
      # trailing 5-minute window, sustained for 5m.
      # The description assumes the series carries check_type and error_type
      # labels - TODO confirm against the exporter.
      - alert: HealthCheckFailure
        expr: |
          rate(cfn_health_check_failure_total[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Health check failures detected"
          description: "Health check {{ $labels.check_type }} is failing (error: {{ $labels.error_type }})"

      # Critical health check failures (multiple checks failing)
      # Counts how many cfn_health_check_failure_total series show failures in
      # the last 5 minutes and fires when at least two do. count() drops all
      # labels, so the resulting alert carries no per-check labels.
      - alert: CriticalHealthCheckFailure
        expr: |
          count(rate(cfn_health_check_failure_total[5m]) > 0) >= 2
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Multiple health checks failing"
          description: "Multiple health checks are failing - system may be unhealthy"

      # High cost per hour
      # increase() over 1h gives the dollars spent in the trailing hour, which
      # is what the $10/hour threshold and the description expect. rate() would
      # return dollars per *second*, making "> 10" equal to $36,000/hour.
      # "for: 1h" requires the overspend to persist for a further hour.
      - alert: HighCostPerHour
        expr: |
          sum by (team) (increase(cfn_agent_cost_dollars_total[1h])) > 10
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "High hourly cost detected"
          description: "Team {{ $labels.team }} is spending more than $10/hour (current: ${{ $value | humanize }})"

      # Docker operation failures
      # Fires when more than 5% of the operations of a given type failed over
      # the last 5 minutes. The value is a failure *ratio* (failures / total),
      # so the 0.05 threshold and humanizePercentage in the description are
      # meaningful; a bare rate() would be failures per second instead.
      # NOTE(review): assumes cfn_docker_operations_total carries an
      # `operation` label and non-failure `status` values - confirm.
      - alert: DockerOperationFailures
        expr: |
          (
            sum by (operation) (rate(cfn_docker_operations_total{status="failure"}[5m]))
            /
            sum by (operation) (rate(cfn_docker_operations_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Docker operation failures detected"
          description: "Docker {{ $labels.operation }} operations are failing at {{ $value | humanizePercentage }}"

      # High memory usage by agents
      # Threshold 2147483648 bytes = 2 GiB, sustained for 5m.
      # The trailing "# 2GB" sits inside the block scalar, so it is passed to
      # Prometheus as part of the expression and treated as a PromQL comment
      # (it is not a YAML comment).
      - alert: HighAgentMemoryUsage
        expr: |
          cfn_agent_memory_usage_bytes > 2147483648  # 2GB
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High agent memory usage"
          description: "Agent {{ $labels.agent_id }} ({{ $labels.agent_type }}) is using {{ $value | humanize1024 }} of memory"

      # CFN Loop stuck (no progress for 30 minutes)
      # cfn_loop_iterations_total is an iteration counter, not a timestamp, so
      # subtracting it from time() is always ~1.7e9 and would fire permanently.
      # Instead, detect "no progress" as the counter not changing at all in the
      # trailing 30 minutes for any series of the task.
      # NOTE(review): a finished task whose series is still exported will also
      # match - confirm the exporter drops series for completed tasks.
      - alert: CFNLoopStuck
        expr: |
          max by (task_id) (changes(cfn_loop_iterations_total[30m])) == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "CFN Loop appears stuck"
          description: "Task {{ $labels.task_id }} has not progressed in over 30 minutes"

      # Low consensus score
      # Warning tier: score strictly below 0.7 for 5m. Scores in [0.70, 0.90)
      # are handled by the separate ConsensusScoreLow info alert.
      # The description assumes task_id and iteration labels exist on the
      # series - TODO confirm.
      - alert: LowConsensusScore
        expr: |
          cfn_loop_consensus_score < 0.7
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low CFN Loop consensus score"
          description: "Task {{ $labels.task_id }} iteration {{ $labels.iteration }} has low consensus ({{ $value | humanizePercentage }})"

      # Low test pass rate
      # Fires when the pass-rate gauge stays below 0.95 for 2m. The value is
      # rendered with humanizePercentage, i.e. it is treated as a 0-1 ratio.
      - alert: LowTestPassRate
        expr: |
          cfn_loop_test_pass_rate < 0.95
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Low CFN Loop test pass rate"
          description: "Task {{ $labels.task_id }} iteration {{ $labels.iteration }} has low test pass rate ({{ $value | humanizePercentage }})"

      # P0 Critical Infrastructure Alerts
      # Fires when Prometheus has failed to scrape a target of job "redis" for
      # 1 minute (`up` is 0 when the last scrape failed).
      # NOTE(review): this only covers targets that are still configured; a
      # target that disappears from service discovery yields no `up` series.
      - alert: RedisConnectivityLost
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
          priority: P0
        annotations:
          summary: "Redis connectivity lost"
          description: "Redis instance {{ $labels.instance }} is unreachable for 1 minute. Immediate action required."
          runbook: "docs/runbooks/redis-connection-loss.md"

      # Fires when Prometheus has failed to scrape a target of job "postgres"
      # for 1 minute (`up` is 0 when the last scrape failed).
      - alert: PostgreSQLConnectivityLost
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
          priority: P0
        annotations:
          summary: "PostgreSQL connectivity lost"
          description: "PostgreSQL instance {{ $labels.instance }} is unreachable for 1 minute. Immediate action required."
          runbook: "docs/runbooks/postgres-connection-loss.md"

      # Fires when Prometheus has failed to scrape a target of job "docker"
      # for 2 minutes (`up` is 0 when the last scrape failed).
      - alert: DockerDaemonUnavailable
        expr: up{job="docker"} == 0
        for: 2m
        labels:
          severity: critical
          priority: P0
        annotations:
          summary: "Docker daemon unavailable"
          description: "Docker daemon on {{ $labels.instance }} is unavailable for 2 minutes."
          runbook: "docs/runbooks/docker-daemon-unavailable.md"

      # Fires when a filesystem has had less than 10% of its size available for
      # 5 minutes. The expression has no fstype/mountpoint selector, so every
      # filesystem series that is scraped is evaluated.
      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.10
        for: 5m
        labels:
          severity: critical
          priority: P0
        annotations:
          summary: "Disk space critical on {{ $labels.instance }}"
          description: "Less than 10% disk space available on {{ $labels.mountpoint }} ({{ $value | humanizePercentage }} free)"
          runbook: "docs/runbooks/disk-space-exhaustion.md"

      # Fires when health checks failed in each of the last three consecutive
      # 5-minute windows.
      # The subquery step is pinned to 5m so its three samples cover disjoint
      # rate() windows. With the default step (the global evaluation interval),
      # overlapping 5m windows let a single failure be counted three or more
      # times, so one isolated failure could trigger this P0 alert.
      - alert: HealthCheckConsecutiveFailures
        expr: |
          count_over_time((rate(cfn_health_check_failure_total[5m]) > 0)[15m:5m]) >= 3
        for: 1m
        labels:
          severity: critical
          priority: P0
        annotations:
          summary: "Three consecutive health check failures"
          description: "System health checks have failed 3 or more times consecutively"

      # P0 tier of the stuck-loop alert: no iteration progress for a full hour.
      # cfn_loop_iterations_total is an iteration counter, not a timestamp, so
      # "time() - counter" is always ~1.7e9 and would fire permanently.
      # Instead, require that the counter did not change at all in the trailing
      # hour for any series of the task.
      # NOTE(review): a finished task whose series is still exported will also
      # match - confirm the exporter drops series for completed tasks.
      - alert: CFNLoopStuckCritical
        expr: |
          max by (task_id) (changes(cfn_loop_iterations_total[1h])) == 0
        for: 5m
        labels:
          severity: critical
          priority: P0
        annotations:
          summary: "CFN Loop critically stuck for over 1 hour"
          description: "Task {{ $labels.task_id }} has not progressed in over 1 hour"
          runbook: "docs/runbooks/cfn-loop-stuck.md"

      # P1 Warning Alerts
      # Fires when either the "backup" job target cannot be scraped, or the
      # backup failure counter increased within the last hour. Comparison
      # operators bind tighter than `or`, so the expression reads as
      # (up == 0) or (increase(...) > 0).
      - alert: BackupFailure
        expr: up{job="backup"} == 0 or increase(backup_failure_total[1h]) > 0
        for: 5m
        labels:
          severity: warning
          priority: P1
        annotations:
          summary: "Backup operation failed"
          description: "Backup job on {{ $labels.instance }} has failed"
          runbook: "docs/runbooks/backup-failure.md"

      # Fires when a certificate has fewer than 7 days (7 * 24 * 3600 seconds)
      # left, sustained for 1h.
      # NOTE(review): subtracting time() assumes ssl_cert_expiry_seconds is an
      # absolute Unix timestamp of the expiry, not a remaining-seconds gauge -
      # confirm against the exporter.
      - alert: CertificateExpiringSoon
        expr: (ssl_cert_expiry_seconds - time()) < (7 * 24 * 3600)
        for: 1h
        labels:
          severity: warning
          priority: P1
        annotations:
          summary: "SSL certificate expiring within 7 days"
          description: "Certificate {{ $labels.cn }} expires in less than 7 days"
          runbook: "docs/runbooks/certificate-expiration.md"

      # Fires when an agent has used more than 3221225472 bytes (3 GiB) for 5m.
      # The description uses humanize1024 (rendering e.g. "3.2Gi", followed by
      # a literal "B"): Prometheus has no `humanizeBytes` template function,
      # and an undefined function makes the rule file fail to load.
      - alert: AgentMemoryCritical
        expr: |
          cfn_agent_memory_usage_bytes > 3221225472  # 3GB
        for: 5m
        labels:
          severity: warning
          priority: P1
        annotations:
          summary: "Agent memory usage exceeds 3GB"
          description: "Agent {{ $labels.agent_type }} on team {{ $labels.team }} using {{ $value | humanize1024 }}B"
          runbook: "docs/runbooks/memory-exhaustion.md"

      # P2 Info Alerts
      # Fires when the current spawn rate deviates from its 1-hour mean by more
      # than two standard deviations, sustained for 10m.
      # The deviation is divided by the standard deviation so that $value is
      # the z-score the description reports; comparing against 2 * stddev
      # instead leaves $value as the raw absolute deviation. The firing
      # condition is unchanged (stddev is non-negative).
      - alert: UnusualAgentSpawnRate
        expr: |
          abs(cfn:agent_spawn_rate:1m - avg_over_time(cfn:agent_spawn_rate:1m[1h]))
          / stddev_over_time(cfn:agent_spawn_rate:1m[1h])
          > 2
        for: 10m
        labels:
          severity: info
          priority: P2
        annotations:
          summary: "Unusual agent spawn rate detected"
          description: "Agent spawn rate is {{ $value | humanize }} standard deviations from normal"

      # Early-warning tier of DiskSpaceCritical: less than 20% of a filesystem
      # available for 10 minutes. There is no lower bound, so this alert keeps
      # firing alongside DiskSpaceCritical once free space drops below 10%.
      - alert: DiskSpaceWarning
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.20
        for: 10m
        labels:
          severity: info
          priority: P2
        annotations:
          summary: "Disk space warning on {{ $labels.instance }}"
          description: "Less than 20% disk space available on {{ $labels.mountpoint }} ({{ $value | humanizePercentage }} free)"
          runbook: "docs/runbooks/disk-space-exhaustion.md"

      # Fires when the requests-to-limit ratio stays above 0.80 for 5m.
      # NOTE(review): api_requests_total looks like a cumulative counter by
      # name. If so, its raw value only grows and this ratio will eventually
      # exceed 0.80 permanently; a windowed increase() matching the limit's
      # period may be intended - confirm the semantics of both metrics.
      # The division also requires both metrics to share identical label sets.
      - alert: APIRateLimitApproaching
        expr: |
          (api_requests_total / api_rate_limit_total) > 0.80
        for: 5m
        labels:
          severity: info
          priority: P2
        annotations:
          summary: "API rate limit approaching for {{ $labels.provider }}"
          description: "{{ $labels.provider }} API usage at {{ $value | humanizePercentage }} of rate limit"

      # Info tier for consensus scores in the band [0.70, 0.90), sustained for
      # 5m. Scores below 0.70 are covered by the LowConsensusScore warning
      # alert, so the two alerts do not overlap.
      - alert: ConsensusScoreLow
        expr: |
          cfn_loop_consensus_score < 0.90 and cfn_loop_consensus_score >= 0.70
        for: 5m
        labels:
          severity: info
          priority: P2
        annotations:
          summary: "CFN Loop consensus score below optimal"
          description: "Task {{ $labels.task_id }} has consensus score {{ $value | humanizePercentage }} (below 0.90 threshold)"
