# Citation Integrity Framework Schema
# Based on REF-059 LitLLM Citation Processing
# Issues: #231 (Retrieval-First Policy), #232 (Whitelist Enforcement), #234 (Page Validation)

# Schema identity: `$schema` selects the JSON Schema dialect and `$id` is the
# canonical URI of this document.
$schema: 'https://json-schema.org/draft/2020-12/schema'
$id: 'https://aiwg.io/schemas/citation-integrity/v1'
title: 'Citation Integrity Framework Schema'
description: |
  Comprehensive citation integrity framework implementing retrieval-first policy,
  corpus whitelist enforcement, and page number validation per REF-059 LitLLM.

# A conforming document is an object carrying a version and all three policy
# sections listed below.
type: object
required:
  - version
  - retrieval_first_policy
  - whitelist_enforcement
  - page_validation

properties:
  # Three dot-separated numeric components, e.g. 1.0.0.
  version:
    type: string
    pattern: '^\d+\.\d+\.\d+$'
    default: '1.0.0'

  # Each policy section delegates to its definition under $defs.
  retrieval_first_policy:
    $ref: '#/$defs/RetrievalFirstPolicy'

  whitelist_enforcement:
    $ref: '#/$defs/WhitelistEnforcement'

  page_validation:
    $ref: '#/$defs/PageValidation'

$defs:
  # Retrieval-first policy: citations are built from documents retrieved from
  # the corpus rather than from model memory.
  RetrievalFirstPolicy:
    type: object
    description: 'Formal retrieval-first policy for citation generation'
    properties:
      enabled:
        type: boolean
        default: true

      # Human-readable policy text, including the five-step workflow.
      policy_statement:
        type: string
        default: |
          CRITICAL: Never generate citations from memory or training data.

          Required workflow:
          1. Retrieve sources from AIWG research corpus
          2. Verify source relevance to claim
          3. Extract citation metadata from retrieved document
          4. Format citation using retrieved metadata
          5. Include page numbers or section references

      # Actions the policy forbids outright.
      prohibited_actions:
        type: array
        items:
          type: string
        default:
          - 'Generating citations from LLM training data'
          - 'Citing papers not in AIWG corpus'
          - 'Fabricating DOIs, URLs, or author lists'
          - "Adding 'relevant' papers without retrieval"
          - 'Guessing page numbers or sections'

      # Structured form of the workflow: each step pairs an action with the
      # check that shows the action was carried out.
      required_workflow:
        type: array
        items:
          type: object
          properties:
            step:
              type: integer
            action:
              type: string
            validation:
              type: string
        default:
          - step: 1
            action: 'Search corpus for relevant sources'
            validation: 'At least one Read tool call to .aiwg/research/corpus/'
          - step: 2
            action: 'Verify source relevance to claim'
            validation: 'Source content supports the claim being made'
          - step: 3
            action: 'Extract citation metadata'
            validation: 'Use REF-XXX identifier from filename'
          - step: 4
            action: 'Format citation properly'
            validation: 'Include [REF-XXX] or inline reference'
          - step: 5
            action: 'Add page/section reference'
            validation: 'Specific location in source document'

      # What to do when the corpus has no suitable source for a claim.
      corpus_expansion_protocol:
        type: object
        properties:
          when_source_not_found:
            type: string
            default: |
              When a claim would benefit from citation but no suitable source
              exists in the AIWG research corpus:

              1. Do NOT cite from memory
              2. Inform user: "This claim would benefit from citation, but no
                 suitable source exists in the corpus."
              3. Recommend addition: "Consider adding [Paper] to corpus using
                 `aiwg add-research-paper`"
              4. Continue without citation or mark as [citation needed]

          # Command template recommended to the user for adding a source.
          addition_command:
            type: string
            default: 'aiwg add-research-paper --title "..." --url "..." --relevance "..."'

  # Corpus-as-whitelist enforcement: only sources present under corpus_path
  # may be cited. Enforcement is described at three levels (agent, validator,
  # commit hook).
  WhitelistEnforcement:
    type: object
    description: "Corpus-as-whitelist enforcement"
    properties:
      enabled:
        type: boolean
        default: true

      # Directory whose files define the set of authorized citations.
      corpus_path:
        type: string
        default: ".aiwg/research/corpus/"

      # Human-readable statement of the whitelist rule.
      whitelist_principle:
        type: string
        default: |
          The research corpus (.aiwg/research/corpus/) is the ONLY authorized
          source list. Any citation not in the corpus is FORBIDDEN.

          For any citation C:
            IF C ∉ Authorized Citations THEN
              REJECT with error "Citation not in corpus"
            ELSE
              ALLOW with metadata from corpus file

      enforcement_levels:
        type: object
        properties:
          # Level 1: rules the citing agent follows while writing.
          agent_level:
            type: object
            properties:
              description: { type: string, default: "Agents only cite corpus sources" }
              rules:
                type: array
                items: { type: string }
                default:
                  - "ONLY cite sources from .aiwg/research/corpus/"
                  - "Use REF-XXX identifier from filename"
                  - "If needed source not in corpus, STOP and recommend addition"
                  - "Never generate citations from memory"

          # Level 2: validator pass; check_pattern is the regex used to find
          # REF identifiers, on_violation describes the reported finding.
          validation_level:
            type: object
            properties:
              description: { type: string, default: "Writing-Validator checks whitelist" }
              check_pattern:
                type: string
                default: "REF-\\d{3}"
              on_violation:
                type: object
                properties:
                  severity: { type: string, default: "critical" }
                  message: { type: string, default: "Citation not in corpus whitelist" }
                  action: { type: string, default: "Remove citation or add source to corpus first" }

          # Level 3: optional pre-commit hook (disabled by default).
          commit_level:
            type: object
            properties:
              description: { type: string, default: "Optional pre-commit hook" }
              enabled: { type: boolean, default: false }
              # Every value under `properties` must itself be a schema, so the
              # hook source is carried as the default of a string property
              # rather than as a bare string.
              script:
                type: string
                default: |
                  #!/bin/bash
                  # Extract REF-XXX citations from lines ADDED to staged markdown.
                  # Removed and context lines are ignored so that a commit which
                  # deletes an invalid citation is not rejected. `--` marks
                  # "*.md" as a pathspec rather than a revision.
                  REFS=$(git diff --cached -- "*.md" \
                    | grep '^+' | grep -v '^+++ ' \
                    | grep -oE "REF-[0-9]{3}" | sort -u)

                  for REF in $REFS; do
                    if ! ls .aiwg/research/corpus/${REF}-*.md 1>/dev/null 2>&1; then
                      echo "ERROR: Citation ${REF} not in corpus"
                      exit 1
                    fi
                  done

      # Citation shapes that reference a corpus entry by REF identifier.
      allowed_citation_formats:
        type: array
        items: { type: string }
        default:
          - "[REF-XXX]"
          - "(REF-XXX)"
          - "[REF-XXX, p.XX]"
          - "[REF-XXX, Section X]"

      # Citation shapes that do not reference the corpus and are rejected.
      forbidden_citation_formats:
        type: array
        items: { type: string }
        default:
          - "(Smith et al., 2023)"
          - "[1]"
          - "Author (Year)"
          - "Any non-corpus reference"

  # Page validation: metadata and rules used to check that a cited page (and
  # section, and quote) actually exists in the source document.
  PageValidation:
    type: object
    description: 'Page number validation for citations'
    properties:
      enabled:
        type: boolean
        default: true

      # Per-document paging metadata.
      document_metadata_schema:
        type: object
        properties:
          total_pages:
            type: integer
            description: 'Total page count of document'
          page_range:
            type: string
            description: "e.g., '1-24'"
          sections:
            type: array
            items:
              type: object
              properties:
                name:
                  type: string
                pages:
                  type: string

      # Shape of a recorded key quote and its validation sign-off.
      key_quote_schema:
        type: object
        properties:
          quote:
            type: string
          page:
            type: integer
          section:
            type: string
          validated:
            type: boolean
            default: false
          validation_date:
            type: string
            format: date
          validator:
            type: string
            description: '@username'

      # Each rule pairs a description with an expression-like rule string.
      validation_rules:
        type: object
        properties:
          page_existence:
            type: object
            properties:
              description:
                type: string
                default: 'Page number within document range'
              rule:
                type: string
                default: '1 <= page <= total_pages'

          section_consistency:
            type: object
            properties:
              description:
                type: string
                default: 'Page within declared section'
              rule:
                type: string
                default: 'section.start <= page <= section.end'

          quote_verification:
            type: object
            properties:
              description:
                type: string
                default: 'Quote exists on cited page (if PDF available)'
              rule:
                type: string
                default: 'extract_text(pdf, page).contains(quote)'

      validation_workflow:
        type: object
        properties:
          # Steps a human validator works through.
          manual_checklist:
            type: array
            items:
              type: string
            default:
              - 'Record total page count'
              - 'Document section page ranges'
              - 'Verify each Key Quote page number'
              - 'Mark each quote as validated'
              - 'Sign and date validation'

          # Checks with a flag saying whether each one is automated.
          automated_checks:
            type: array
            items:
              type: object
              properties:
                check:
                  type: string
                automated:
                  type: boolean
            default:
              - check: 'Page within range'
                automated: true
              - check: 'Section consistency'
                automated: true
              - check: 'Quote on page (PDF)'
                automated: false

# Citation validation result: shape of the outcome recorded for one citation.
# `checks` holds one boolean per validation; `issues` lists findings.
citation_validation_result:
  type: object
  properties:
    ref_id:
      type: string
    status:
      type: string
      enum:
        - valid
        - invalid
        - warning
    checks:
      type: object
      properties:
        in_corpus:
          type: boolean
        page_valid:
          type: boolean
        section_consistent:
          type: boolean
        quote_verified:
          type: boolean
    issues:
      type: array
      items:
        type: object
        properties:
          severity:
            type: string
          message:
            type: string
          suggestion:
            type: string

# Agent citation workflow: ordered steps to follow when a claim needs a
# citation, plus one allowed and one forbidden example.
agent_citation_workflow:
  type: object
  properties:
    when_citation_needed:
      type: array
      items:
        type: string
      default:
        - '1. Search corpus: @.aiwg/research/corpus/*.md'
        - '2. If no match: Recommend adding source, do NOT cite'
        - '3. If match: Extract metadata from corpus file'
        - '4. Use REF-XXX identifier from filename'
        - '5. Include page/section reference'
        - '6. Verify quote exists at cited location'

    # A citation backed by an existing corpus file.
    example_valid:
      type: string
      default: |
        ✅ ALLOWED:
        "Voice consistency improves quality [REF-043, p.15]"
        (corpus file .aiwg/research/corpus/REF-043-voice-consistency.md exists)

    # An author-year citation with no corpus entry behind it.
    example_invalid:
      type: string
      default: |
        ❌ FORBIDDEN:
        "Voice consistency improves quality (Smith et al., 2023)"
        (not in corpus, citation from memory)

# CLI commands: command line, description, and (where listed) options or
# output sections for each citation-related `aiwg` subcommand.
cli_commands:
  validate_citations:
    command: 'aiwg validate-citations <file>'
    description: 'Validate all citations in file'
    options:
      - name: '--strict'
        description: 'Fail on any warning'
      - name: '--fix'
        description: 'Remove invalid citations'

  corpus_check:
    command: 'aiwg corpus-check'
    description: 'List citations not in corpus'

  page_validate:
    command: 'aiwg page-validate <ref-id>'
    description: 'Validate page numbers for a paper'

  citation_audit:
    command: 'aiwg citation-audit'
    description: 'Full citation integrity audit'
    # Sections listed for the audit output.
    output:
      - 'Corpus coverage'
      - 'Page validation status'
      - 'Quote verification status'

# Agent protocol: ordered step lists for generating, validating, and auditing
# citations. A single-key mapping inside a list (e.g. if_not_found) names a
# branch and holds that branch's sub-steps.
agent_protocol:
  generate_citation:
    description: 'Generate citation with integrity checks'
    steps:
      - identify_claim_needing_citation
      - search_corpus_for_source
      # No corpus match: never cite; suggest adding the source instead.
      - if_not_found:
          - do_not_cite
          - recommend_corpus_addition
          - mark_citation_needed
      # Corpus match: build the citation from the corpus file.
      - if_found:
          - read_corpus_file
          - extract_ref_id
          - identify_relevant_quote
          - validate_page_number
          - format_citation
          - return_citation

  validate_citation:
    description: 'Validate existing citation'
    steps:
      - extract_ref_id_from_citation
      - check_corpus_whitelist
      - if_not_in_corpus:
          - flag_critical_error
      - if_in_corpus:
          - validate_page_number
          - validate_section_consistency
          # Quote verification is attempted only when a PDF is available.
          - if_pdf_available:
              - verify_quote_on_page
      - return_validation_result

  audit_all_citations:
    description: 'Audit all citations in document'
    steps:
      - extract_all_citations
      # Runs validate_citation on every extracted citation.
      - for_each_citation:
          - validate_citation
          - record_result
      - generate_audit_report
      - return_summary

# Storage: directories for the corpus, validation logs, and audit reports.
storage:
  corpus: '.aiwg/research/corpus/'
  validation_logs: '.aiwg/logs/citation-validation/'
  audit_reports: '.aiwg/reports/citation-audit/'

# Research targets (from REF-059 LitLLM): one-line statement of each goal
# this framework implements.
research_targets:
  retrieval_first: 'Never generate citations without retrieval'
  corpus_whitelist: 'Only cite sources in research corpus'
  page_validation: 'Verify page numbers exist and are correct'
  quote_verification: 'Confirm quotes exist at cited locations'

# Example validation report: sample text of a citation integrity audit,
# stored verbatim as a literal block scalar (every character below, including
# spacing, is part of the value).
# NOTE(review): SUMMARY reports 15 citations while DETAILS lists four entries;
# presumably an abridged excerpt - confirm.
example_validation_report: |
  ================================================================================
                         CITATION INTEGRITY AUDIT
  ================================================================================

  Document: docs/voice-framework/technical-guide.md
  Date: 2026-01-25
  Auditor: writing-validator

  SUMMARY:
    Total Citations: 15
    Valid: 12
    Warnings: 2
    Errors: 1

  DETAILS:

  ✓ [REF-043, p.15] - Valid
    - In corpus: Yes
    - Page valid: Yes (1-24 range)
    - Section consistent: Yes (Results)

  ⚠ [REF-018, p.8] - Warning
    - In corpus: Yes
    - Page valid: Yes
    - Section consistent: No (cited as Introduction, actually Methods)

  ✗ [REF-099] - Error
    - In corpus: No
    - Action: Remove citation or add REF-099 to corpus

  ⚠ [REF-021, p.45] - Warning
    - In corpus: Yes
    - Page valid: No (document has 32 pages)
    - Action: Verify correct page number

  RECOMMENDATIONS:
  1. Remove or add REF-099 to corpus
  2. Verify REF-018 section reference
  3. Correct REF-021 page number

# References: the research finding behind this schema, the issues it
# implements, and related files. Entries stay quoted because they begin with
# `@` or `#`, which are not valid starts for plain YAML scalars.
references:
  research:
    - '@.aiwg/research/findings/REF-059-litllm-citation-processing.md'
  implementation:
    - '#231'
    - '#232'
    - '#234'
  related:
    - '@.aiwg/research/corpus/'
    - '@agentic/code/agents/writing-validator.md'
    - '@agentic/code/frameworks/sdlc-complete/schemas/flows/grade-evidence-quality.yaml'
