# Workflow Checkpoint Schema
# Based on REF-058 R-LAM (systematic checkpoint/recovery)
# Issues: #112 (original checkpoint implementation), #268 (multi-loop support)

# Schema identity: dialect, canonical ID, and human-readable summary.
$schema: 'https://json-schema.org/draft/2020-12/schema'
$id: 'https://aiwg.io/schemas/checkpoint/v2'
title: 'Workflow Checkpoint Schema'
description: |
  Systematic checkpoint and recovery for workflows to enable resumption
  from last good state after failures. Based on R-LAM findings.
  Version 2 adds multi-loop support with loop_id field.

# A checkpoint document is an object; these four fields must be present.
type: object
required: [checkpoint_id, workflow_id, created_at, state]

# Top-level checkpoint fields.
properties:
  # Identity of this checkpoint record.
  checkpoint_id:
    description: 'Unique checkpoint identifier'
    type: string
    format: uuid

  # The workflow this checkpoint belongs to.
  workflow_id:
    description: 'ID of the workflow being checkpointed'
    type: string

  # Identifier of the Ralph loop that owns this checkpoint (added in v2).
  loop_id:
    # JSON Schema 2020-12 has no `nullable` keyword (that is OpenAPI 3.0) and
    # ignores it, so a nullable string must be written as a type union.
    # `pattern` only constrains string instances, so null still validates.
    type:
      - string
      - "null"
    pattern: "^ralph-[a-z0-9-]+-[a-f0-9]{8}$"
    description: |
      Loop ID for multi-loop support (v2). Required for ralph_loop workflows
      in multi-loop mode.
      Format: ralph-{slug}-{uuid8}
      Null for non-Ralph or legacy single-loop checkpoints.

  # Which kind of workflow produced the checkpoint.
  workflow_type:
    description: 'Type of workflow'
    type: string
    enum: [ralph_loop, flow_command, skill_execution, agent_task]

  # Creation timestamp, carried as a date-time formatted string.
  created_at:
    description: 'When checkpoint was created'
    type: string
    format: date-time

  # Event that caused the checkpoint to be written.
  trigger:
    description: 'What triggered the checkpoint'
    type: string
    enum:
      # User-initiated
      - manual
      # System-initiated
      - automatic
      # At phase transition
      - phase_boundary
      # At Ralph iteration
      - iteration
      # Before risky operation
      - pre_risky_op
      # During error recovery
      - error_recovery

  # Snapshot of the workflow at checkpoint time. phase, progress and
  # artifacts are mandatory; the remaining sub-fields are optional.
  state:
    description: 'Complete workflow state'
    type: object
    required:
      - phase
      - progress
      - artifacts
    properties:
      phase:
        description: 'Current SDLC phase'
        type: string

      step:
        description: 'Current step within phase'
        type: string

      # Step counters plus a 0-100 completion percentage.
      progress:
        description: 'Progress tracking'
        type: object
        properties:
          total_steps: {type: integer}
          completed_steps: {type: integer}
          percentage:
            type: number
            minimum: 0
            maximum: 100

      # Loop counters; only meaningful for Ralph workflows.
      iteration:
        description: 'Ralph iteration state (if applicable)'
        type: object
        properties:
          current: {type: integer}
          max: {type: integer}
          failures: {type: integer}

      # One entry per tracked file: where it is, its hash, and what
      # happened to it.
      artifacts:
        description: 'Artifact state at checkpoint'
        type: array
        items:
          type: object
          properties:
            path: {type: string}
            hash: {type: string}
            status:
              type: string
              enum:
                - created
                - modified
                - deleted
                - unchanged

      # Free-form key/value bag; any property is accepted.
      variables:
        description: 'Workflow variables and context'
        type: object
        additionalProperties: true

      agent_state:
        description: 'Agent execution state'
        type: object
        properties:
          current_agent: {type: string}
          memory: {type: object}
          pending_actions:
            type: array
            items: {type: string}

  # Reuses the shared ExecutionConfig definition from $defs.
  execution_config:
    description: 'Configuration for reproducibility'
    $ref: '#/$defs/ExecutionConfig'

  # Bookkeeping about the stored checkpoint itself.
  metadata:
    description: 'Checkpoint metadata'
    type: object
    properties:
      size_bytes: {type: integer}
      compression:
        type: string
        enum:
          - none
          - gzip
          - lz4
      retention_days:
        type: integer
        default: 30
      tags:
        type: array
        items: {type: string}
      schema_version:
        description: 'Schema version for migration tracking'
        type: string
        # Quoted so the versions stay strings rather than floats.
        enum: ['1.0', '2.0']
        default: '2.0'

# Reusable sub-schemas, referenced from `properties` via $ref.
$defs:
  ExecutionConfig:
    description: 'Configuration snapshot for reproducibility'
    type: object
    properties:
      model:
        description: 'Model ID (e.g., claude-3-opus)'
        type: string
      # Bounded to the closed range 0-2.
      temperature:
        type: number
        minimum: 0
        maximum: 2
      seed:
        description: 'Random seed if set'
        type: integer
      execution_mode:
        type: string
        enum:
          - strict
          - seeded
          - logged
          - default
      agent:
        description: 'Active agent'
        type: string
      tools:
        description: 'Available tools'
        type: array
        items: {type: string}
      # Free-form key/value bag; any property is accepted.
      inputs:
        description: 'Input values'
        type: object
        additionalProperties: true

# Recovery configuration
# Not a JSON Schema keyword: this section describes recovery behaviour
# alongside the schema rather than constraining checkpoint documents.
recovery:
  # Named recovery strategies. Each one carries a description and a list of
  # step names; all three end with load_checkpoint -> restore_state ->
  # resume_execution and differ only in how the checkpoint is chosen.
  strategies:
    last_checkpoint:
      description: "Restore most recent checkpoint"
      steps:
        - load_checkpoint
        - restore_state
        - resume_execution

    select_checkpoint:
      description: "Let user choose checkpoint"
      steps:
        - list_checkpoints
        - user_selects
        - load_checkpoint
        - restore_state
        - resume_execution

    smart_rollback:
      description: "Analyze and select best checkpoint"
      steps:
        - analyze_failure
        - find_safe_checkpoint
        - load_checkpoint
        - restore_state
        - resume_execution

  # NOTE(review): presumably the detailed expansion of the `restore_state`
  # step above, executed top to bottom - confirm against the implementation.
  restore_sequence:
    - validate_checkpoint_integrity
    - restore_artifacts
    - restore_variables
    - restore_agent_state
    - verify_state
    - resume_or_fail

# Checkpoint lifecycle
# Covers creation triggers, retention limits, and cleanup settings.
lifecycle:
  creation:
    # NOTE(review): these names differ from the `trigger` enum values in the
    # schema (e.g. phase_start vs phase_boundary) - confirm how they map.
    auto_triggers:
      - phase_start
      - artifact_complete
      - before_external_call
      - iteration_boundary

  retention:
    default_count: 5
    # Same value as the schema default for metadata.retention_days.
    default_days: 30
    on_success: keep_latest
    on_failure: keep_all_recent

  cleanup:
    schedule: daily
    preserve_tagged: true
    compress_old: true

# Storage configuration
storage:
  # Active format
  format: json
  compression: gzip

  # Multi-loop path structure (v2): a checkpoint directory per loop_id,
  # with file names keyed by zero-padded iteration number.
  multi_loop_location: '.aiwg/ralph/loops/{loop_id}/checkpoints/'
  multi_loop_naming: 'iteration-{iteration:03d}.json.gz'

  # Legacy single-loop path (v1, deprecated): one shared directory.
  legacy_location: '.aiwg/checkpoints/'
  legacy_naming: '{workflow_id}-{timestamp}-{trigger}.json.gz'

# Path Resolution
# Maps checkpoint attributes to one of the storage locations.
path_resolution:
  # Presumably evaluated top to bottom, with `else` as the fallback.
  # NOTE(review): the second rule and `else` both resolve to
  # use_legacy_location, so the second rule is informational only.
  rules:
    - if_loop_id_present: use_multi_loop_location
    - if_workflow_type_ralph_loop_and_no_loop_id: use_legacy_location
    - else: use_legacy_location

  # Fully expanded sample paths, one per layout.
  examples:
    multi_loop: ".aiwg/ralph/loops/ralph-fix-tests-a1b2c3d4/checkpoints/iteration-005.json.gz"
    legacy: ".aiwg/checkpoints/ralph-research-impl-2026-01-25T15-00-00Z-iteration.json.gz"

# Backward Compatibility
# Describes how checkpoints without a loop_id (v1 style) are handled.
backward_compatibility:
  single_loop_checkpoints:
    description: |
      Checkpoints without loop_id are valid for single-loop workflows.
      When loading, if loop_id is null and workflow_type is ralph_loop,
      treat as legacy single-loop checkpoint.

    # Read path: legacy checkpoints are loaded from the legacy location
    # and are not migrated to the multi-loop layout.
    migration_on_read:
      - if_loop_id_null_and_ralph_loop: treat_as_single_loop
      - load_from_legacy_location
      - no_automatic_migration_to_multi_loop

    # Write path: the storage path depends on whether loop_id is set;
    # schema_version "2.0" is recorded in metadata.
    migration_on_write:
      - if_loop_id_provided: use_multi_loop_path
      - if_loop_id_null: use_legacy_path
      - set_metadata.schema_version: "2.0"

  validation:
    # Validate loop_id if workflow_type is ralph_loop in multi-loop mode
    ralph_loop_requires_loop_id_in_multi_loop: true
    # A null loop_id stays acceptable for legacy checkpoints.
    allow_null_loop_id_for_legacy: true

# Example checkpoints
examples:
  # Multi-loop checkpoint (v2): loop_id is set and matches the
  # ralph-{slug}-{uuid8} pattern.
  # checkpoint_id is a real UUID because the schema declares `format: uuid`
  # for that field; a label such as "cp-001-multi-loop" fails validators
  # that assert formats.
  - checkpoint_id: "3f2b8c1e-5d4a-4e6f-9a7b-1c2d3e4f5a6b"
    workflow_id: "ralph-fix-tests-a1b2c3d4"
    loop_id: "ralph-fix-tests-a1b2c3d4"
    workflow_type: ralph_loop
    created_at: "2026-02-02T21:05:00Z"
    trigger: iteration
    state:
      phase: construction
      step: implement_issue
      progress:
        total_steps: 200
        completed_steps: 5
        percentage: 2.5
      iteration:
        current: 5
        max: 200
        failures: 0
      artifacts:
        - path: "src/auth/login.ts"
          hash: "abc123..."
          status: modified
      variables:
        current_issue: "#268"
        implementation_approach: "multi-loop"
    execution_config:
      model: "claude-sonnet-4.5"
      temperature: 0
      execution_mode: strict
      agent: "Software Implementer"
    metadata:
      size_bytes: 4096
      compression: gzip
      retention_days: 30
      schema_version: "2.0"

  # Legacy single-loop checkpoint (v1, backward compatible): loop_id is
  # null and schema_version is "1.0".
  # checkpoint_id is a real UUID because the schema declares `format: uuid`
  # for that field; a label such as "cp-002-legacy" fails validators that
  # assert formats.
  - checkpoint_id: "7a1c9e2d-4b3f-4c8a-8d6e-0f1a2b3c4d5e"
    workflow_id: "ralph-research-impl"
    loop_id: null
    workflow_type: ralph_loop
    created_at: "2026-01-25T15:00:00Z"
    trigger: iteration
    state:
      phase: construction
      step: implement_issue
      progress:
        total_steps: 10
        completed_steps: 3
        percentage: 30
      iteration:
        current: 5
        max: 200
        failures: 0
      artifacts:
        - path: "agentic/code/frameworks/sdlc-complete/schemas/research/checkpoint.yaml"
          hash: "abc123..."
          status: created
      variables:
        current_issue: "#112"
        implementation_approach: "agentic"
    execution_config:
      model: "claude-3-opus"
      temperature: 0
      execution_mode: strict
      agent: "Software Implementer"
    metadata:
      size_bytes: 4096
      compression: gzip
      retention_days: 30
      schema_version: "1.0"

# Migration Guide
# Summarises the v1 -> v2 change; it is declared non-breaking.
migration_guide:
  from_v1_to_v2:
    description: "Adding multi-loop support"
    breaking_changes: false
    # Changes that make up the v2 upgrade; the last entry records that
    # existing checkpoints need no automatic migration.
    steps:
      - add_loop_id_field_nullable
      - update_storage_path_logic
      - maintain_legacy_path_support
      - no_automatic_migration_required

    # Compatibility guarantees claimed for the v1/v2 pair.
    compatibility:
      v1_checkpoints_readable: true
      v2_checkpoints_backward_compatible: true
      mixed_versions_supported: true

# References
# Entries are quoted because a leading `@` or `#` is not allowed in, or
# would be misread from, a plain YAML scalar.
references:
  # Research finding this schema is based on.
  research:
    - "@.aiwg/research/findings/REF-058-r-lam.md"
  # Issue numbers tracking the implementation work.
  implementation:
    - "#112"  # Original checkpoint implementation
    - "#268"  # Multi-loop support
  # Sibling schema files.
  related:
    - "@agentic/code/frameworks/sdlc-complete/schemas/flows/error-handling.yaml"
    - "@agentic/code/addons/ralph/schemas/reflection-memory.json"
    - "@agentic/code/addons/ralph/schemas/loop-registry.yaml"
    - "@agentic/code/addons/ralph/schemas/loop-state.yaml"
