import {
  NativeModules,
  NativeEventEmitter,
  Platform,
} from 'react-native';
import type { EmitterSubscription } from 'react-native';

// Handle to the native STT module; the STTManager methods below call into it.
const nativeSTT = NativeModules.STTManager;
// Event emitter constructed over the native module (its consumers are outside this part of the file).
const emitter = new NativeEventEmitter(nativeSTT);

/**
 * VAD (Voice Activity Detection) state, as carried by `VADEvent.state`.
 *
 * NOTE(review): the names suggest a silence -> speech_start -> speech -> speech_end
 * cycle; confirm the exact transitions against the native implementation.
 */
export type VADState =
  | 'silence'
  | 'speech_start'
  | 'speech'
  | 'speech_end';

/**
 * VAD sensitivity mode
 * - aggressive: Less sensitive, fewer false positives
 * - normal: Balanced sensitivity (the value used by the default VAD configuration)
 * - sensitive: More sensitive, catches quieter speech
 */
export type VADMode = 'aggressive' | 'normal' | 'sensitive';

/**
 * Speaker status for diarization.
 * Used on transcription results, speaker events and speaker profiles.
 *
 * NOTE(review): presumably a newly seen speaker is 'pending' until it becomes
 * 'confirmed' — verify against the native diarization logic.
 */
export type SpeakerStatus = 'pending' | 'confirmed';

/**
 * Model type for STT processing
 * - streaming: Real-time partial results (lower accuracy, instant feedback)
 * - offline: VAD-triggered batch processing (higher accuracy, slight delay)
 *
 * When the architecture is 'whisper' and 'streaming' is requested (or nothing
 * is specified), the config builder in this file switches to 'offline' and
 * logs a warning.
 */
export type ModelType = 'streaming' | 'offline';

/**
 * Model architecture for STT models
 * - transducer: Encoder-Decoder-Joiner (Parakeet, Zipformer, NeMo models);
 *   a joiner path must be supplied or configuration building throws
 * - whisper: Encoder-Decoder only (OpenAI Whisper models)
 */
export type ModelArchitecture = 'transducer' | 'whisper';

/**
 * ONNX Runtime execution provider
 * - cpu: Default, always available (also the value sent when no provider is configured)
 * - nnapi: Android Neural Networks API (hardware accelerated, Android 8.1+)
 * - gpu: GPU inference via OpenGL ES
 * - coreml: Apple CoreML (iOS only)
 */
export type Provider = 'cpu' | 'nnapi' | 'gpu' | 'coreml';

/**
 * Token timing information for word-level timestamps
 */
export interface TokenTiming {
  /** The token text. */
  token: string;
  /** Start of the token. NOTE(review): unit (seconds vs ms) is not stated in this file — confirm. */
  startTime: number;
  /** End of the token; presumably the same unit as `startTime` — confirm. */
  endTime: number;
  /** Confidence for this token. NOTE(review): presumably 0-1 like `STTResult.confidence` — confirm. */
  confidence: number;
}

/**
 * VAD configuration for speech segmentation.
 * The defaults quoted on each field are the values held by `defaultVADConfig`.
 */
export interface VADConfig {
  /** Detection threshold. Default: 0.5 */
  threshold: number;
  /** Minimum speech duration in ms. Default: 300 */
  minSpeechDurationMs: number;
  /** Minimum silence duration in ms. Default: 500 */
  minSilenceDurationMs: number;
  /** Maximum speech duration in ms before a forced break. Default: 30000 */
  maxSpeechDurationMs: number;
  /** Speech padding in ms. Default: 100 */
  speechPaddingMs: number;
  /** Sensitivity mode. Default: 'normal' */
  mode: VADMode;
}

/**
 * Diarization configuration for speaker identification.
 * Uses dual thresholds to prevent profile degradation from poor matches:
 * a lenient one for assigning a segment to a speaker and a strict one for
 * updating that speaker's profile.
 * The defaults quoted below are the values held by `defaultDiarizationConfig`.
 */
export interface DiarizationConfig {
  /** Min similarity for speaker assignment (lenient). Default: 0.55 */
  speakerThreshold: number;
  /** Min similarity for profile update (strict). Default: 0.70 */
  embeddingThreshold: number;
  /** Min speech duration to create speaker (ms). Default: 800 */
  minSpeechDurationMs: number;
  /** Min speech duration to update profile (ms). Default: 500 */
  minEmbeddingUpdateMs: number;
  /** Max tracked speakers (LRU eviction). Default: 10 */
  maxSpeakers: number;
  /** Weight for running average update. Default: 0.2 */
  embeddingAverageWeight: number;
  /** Number of initial segments always assigned to Speaker 1. Default: 3 */
  onboardingSegments: number;
  /** Minimum accumulated speech (ms) before creating new speaker. Default: 3000 */
  minSpeechForNewSpeaker: number;
}

/**
 * Speaker profile information for management APIs
 * (the element type returned by `getSpeakerProfiles()`).
 */
export interface SpeakerProfileInfo {
  /** Numeric speaker ID, as accepted by `mergeSpeakers()` / `removeSpeaker()`. */
  speakerId: number;
  /** Whether the speaker is still pending or has been confirmed. */
  status: SpeakerStatus;
  /** Number of segments attributed to this speaker. */
  segmentCount: number;
  /** Total speech attributed to this speaker, in milliseconds (per the field name). */
  totalSpeechMs: number;
  /** When the speaker was last seen. NOTE(review): epoch vs. session-relative and unit are not stated here — confirm. */
  lastSeenTimestamp: number;
}

/**
 * Diarization mode (see `setDiarizationMode()` / `getDiarizationMode()`)
 * - embedding: Uses speaker embeddings with cosine similarity (default)
 * - pyannote: Uses pyannote segmentation model for better speaker change detection
 */
export type DiarizationMode = 'embedding' | 'pyannote';

/**
 * Configuration for pyannote-based speaker segmentation.
 * Provides more accurate speaker change detection using the pyannote-segmentation-3.0 model.
 * Optional fields that are omitted are filled with the documented defaults
 * before the configuration is handed to the native module.
 */
export interface PyannoteSegmentationConfig {
  /** Path to pyannote segmentation model (model.onnx) */
  segmentationModelPath: string;
  /** Path to speaker embedding model (for clustering) */
  embeddingModelPath: string;
  /** Expected number of speakers (0 for auto-detection using threshold). Default: 0 */
  numSpeakers?: number;
  /** Distance threshold for clustering (used when numSpeakers=0). Smaller = more clusters. Default: 0.5 */
  clusteringThreshold?: number;
  /** Minimum duration for speech segment in seconds. Default: 0.5 */
  minDurationOn?: number;
  /** Minimum duration for silence segment in seconds. Default: 0.3 */
  minDurationOff?: number;
  /** Number of threads for inference. Default: 2 */
  numThreads?: number;
  /** ONNX provider: 'cpu', 'nnapi', 'gpu', or 'coreml'. Default: 'cpu' */
  provider?: Provider;
}

/**
 * A single diarization segment with speaker assignment.
 * Note the mixed units: start/end are in seconds, the duration is in milliseconds.
 */
export interface DiarizationSegment {
  /** Assigned speaker ID */
  speakerId: number;
  /** Start time in seconds */
  startTime: number;
  /** End time in seconds */
  endTime: number;
  /** Duration in milliseconds */
  durationMs: number;
}

/**
 * Result from pyannote diarization (the value resolved by `diarizeFile()`).
 */
export interface DiarizationResult {
  /** Array of speaker segments with timestamps */
  segments: DiarizationSegment[];
  /** Number of unique speakers detected */
  speakerCount: number;
  /** List of unique speaker IDs */
  speakers: number[];
  /** Total audio duration in seconds */
  audioDuration: number;
}

/**
 * Multi-speaker detection event.
 * Emitted when multiple speakers are detected in a single VAD segment
 * (subscribed to under the 'multiSpeaker' event type).
 */
export interface MultiSpeakerEvent {
  /** List of detected speaker IDs */
  speakers: number[];
  /** Detailed windows showing speaker at each timestamp (timestamps in ms, per the field name) */
  windows: Array<{ timestampMs: number; speakerId: number }>;
  /** Number of unique speakers */
  speakerCount: number;
}

/**
 * Captured audio file information
 * (the element type returned by `listCapturedFiles()`).
 */
export interface CapturedFileInfo {
  /** Path of the captured file; can be passed to `deleteCapturedFile()`. */
  path: string;
  /** File name. */
  name: string;
  /** File size. NOTE(review): presumably bytes — confirm against the native module. */
  size: number;
  /** Last-modified time. NOTE(review): presumably epoch milliseconds — confirm. */
  lastModified: number;
}

/**
 * Audio capture type (which point of the pipeline is written to the WAV file)
 * - raw: Direct microphone audio before any processing
 * - processed: Audio after denoiser, just before STT model
 */
export type AudioCaptureType = 'raw' | 'processed';

/**
 * Audio capture status (the value resolved by `getAudioCaptureStatus()`).
 */
export interface AudioCaptureStatus {
  /** Whether capture of raw audio is enabled. */
  rawEnabled: boolean;
  /** Whether capture of processed audio is enabled. */
  processedEnabled: boolean;
}

/**
 * Audio source for recording (see `setAudioSource()`; a change takes effect
 * on the next recording start)
 * - voice_communication: Optimized for VoIP with AEC support (default)
 * - voice_recognition: Optimized for speech recognition without VoIP
 * - mic: Default microphone input, raw audio
 */
export type AudioSource = 'voice_communication' | 'voice_recognition' | 'mic';

/**
 * Provider information: describes one execution provider on the current device.
 */
export interface ProviderInfo {
  /** Which provider this entry describes. */
  name: Provider;
  /** Whether the provider can be used on this device. */
  available: boolean;
  /** Human-readable description. */
  description: string;
  /** NOTE(review): presumably the minimum Android API level the provider needs — confirm. */
  minApiLevel?: number;
  /** NOTE(review): presumably the device's current Android API level — confirm. */
  currentApiLevel?: number;
  /** NOTE(review): presumably the OpenGL ES version reported for the 'gpu' provider — confirm. */
  glEsVersion?: string;
}

/**
 * Device providers information: the per-provider list plus device details
 * and the recommended provider.
 */
export interface DeviceProvidersInfo {
  /** One entry per known execution provider. */
  providers: ProviderInfo[];
  /** Device identifier string as reported by the native side. */
  device: string;
  /** Device manufacturer as reported by the native side. */
  manufacturer: string;
  /** API level of the device (Android, judging by the sibling `androidVersion` field). */
  apiLevel: number;
  /** Android version string. */
  androidVersion: string;
  /** Provider the native side recommends for this device. */
  recommended: Provider;
}

/**
 * Configuration for STT initialization.
 * Passed to `STTManager.initialize()`, which serializes it (with defaults
 * filled in) to JSON for the native module.
 */
export interface STTConfig {
  // STT model configuration - individual paths to model files
  /** Full path to encoder ONNX model file */
  encoderPath: string;
  /** Full path to decoder ONNX model file */
  decoderPath: string;
  /**
   * Full path to joiner ONNX model file (required for transducer, not used for whisper).
   * Building the configuration throws if this is missing for the transducer architecture.
   */
  joinerPath?: string;
  /** Full path to tokens.txt file */
  tokensPath: string;

  /**
   * Model architecture determines the model configuration:
   * - transducer: Encoder-Decoder-Joiner (Parakeet, Zipformer) - requires joinerPath
   * - whisper: Encoder-Decoder only (OpenAI Whisper) - joinerPath not needed
   * Default: 'transducer'
   */
  modelArchitecture?: ModelArchitecture;

  /**
   * Whisper-specific: Target language for transcription
   * - Empty string "" for auto-detection from audio
   * - Language codes: "en", "fr", "de", "es", "zh", "ja", etc.
   * See: https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
   * Default: "" (auto-detect)
   * Only used when modelArchitecture is 'whisper'
   */
  whisperLanguage?: string;

  /**
   * Whisper-specific: Task type
   * - "transcribe": Transcribe speech to text in the original language
   * - "translate": Translate speech to English
   * Default: "transcribe"
   * Only used when modelArchitecture is 'whisper'
   */
  whisperTask?: 'transcribe' | 'translate';

  /**
   * Model type: 'streaming' for real-time, 'offline' for VAD-triggered batch
   * - streaming: Partial results as you speak (default)
   * - offline: Higher accuracy, processes after each speech segment
   * Note: Whisper models only support 'offline' mode; a whisper config that
   * asks for (or defaults to) 'streaming' is switched to 'offline' with a warning.
   */
  modelType?: ModelType; // Default: 'streaming'

  // VAD configuration (TEN-VAD) - structured config
  /** Path to the VAD model file */
  vadModelPath: string;
  /** VAD settings; merged over the built-in default VAD configuration */
  vad: VADConfig;

  // Speaker diarization configuration (optional)
  /** Path to the diarization model; sent as null when omitted */
  diarizationModelPath?: string;
  /** Diarization configuration with dual thresholds; omitted fields use the built-in defaults */
  diarization?: Partial<DiarizationConfig>;

  // Audio configuration
  /** Audio sample rate in Hz. Default: 16000 */
  sampleRate?: number; // Default: 16000

  // Denoiser configuration (optional)
  /** Path to the denoiser model; sent as an empty string when omitted */
  denoiserModelPath?: string;

  /**
   * ONNX Runtime execution provider
   * - cpu: Default, always available
   * - nnapi: Android Neural Networks API (hardware accelerated)
   * - gpu: GPU inference
   * - coreml: Apple CoreML (iOS only)
   * Use getAvailableProviders() to check what's available on the device
   */
  provider?: Provider; // Default: 'cpu'

  /**
   * Normalize transcript text case
   * - 'lowercase': Convert all text to lowercase (useful for Zipformer models that output UPPERCASE)
   * - 'uppercase': Convert all text to uppercase
   * - 'none': Keep original case from the model (default)
   */
  normalizeCase?: 'lowercase' | 'uppercase' | 'none'; // Default: 'none'

  /**
   * Sentence-level finalization for streaming mode.
   * Emits 'sentence' events when complete sentences are detected based on punctuation.
   * Useful for displaying finalized sentences during long-form speech.
   * This setting is not part of the JSON sent to the native module; it is
   * handled in the TypeScript layer.
   */
  sentenceFinalization?: Partial<SentenceFinalizationConfig>;
}

/**
 * Configuration for punctuation restoration model
 * (passed to `initializePunctuation()`).
 */
export interface PunctuationConfig {
  /** Full path to the CT-Transformer ONNX model file */
  modelPath: string;
  /** Number of threads for inference. Default: 2 */
  numThreads?: number;
  /** ONNX provider: 'cpu', 'nnapi', 'gpu', or 'coreml'. Default: 'cpu' */
  provider?: Provider;
}

/**
 * Result from speech recognition
 */
export interface STTResult {
  /** Recognized text. */
  text: string;
  /** Whether this result is final. */
  isFinal: boolean;
  /** Start of the recognized span. NOTE(review): unit not stated here — confirm. */
  startTime: number;
  /** End of the recognized span; presumably the same unit as `startTime` — confirm. */
  endTime: number;
  /** REQUIRED: confidence in the range 0-1 */
  confidence: number;
  /** REQUIRED: processing time in seconds */
  processingTime: number;
  /** REQUIRED: audio duration in seconds */
  audioDuration: number;
  /** REQUIRED: audioDuration / processingTime (>1 = faster than real-time) */
  rtfx: number;
  /** Speaker ID, when available. */
  speakerId?: number;
  /** Speaker status, when available. */
  speakerStatus?: SpeakerStatus;
  /** Per-token timings, when available. */
  tokenTimings?: TokenTiming[];
}

/**
 * Streaming transcript update with volatile/confirmed state
 */
export interface StreamingTranscriptUpdate {
  /** Current guess (may change) */
  volatile: string;
  /** Locked-in text (won't change) */
  confirmed: string;
  /** confirmed + volatile */
  fullText: string;
  /** Whether this update is final. */
  isFinal: boolean;
  /** Confidence of the update. NOTE(review): presumably 0-1 like `STTResult.confidence` — confirm. */
  confidence: number;
  /** Processing time. NOTE(review): presumably seconds like `STTResult.processingTime` — confirm. */
  processingTime: number;
  /** Real-time factor. NOTE(review): presumably defined as in `STTResult.rtfx` — confirm. */
  rtfx: number;
}

/**
 * STT Error event (payload of the 'error' event type).
 */
export interface STTError {
  /** Error code string. */
  code: string;
  /** Human-readable error message. */
  message: string;
}

/**
 * VAD event data (payload of the 'vad' event type).
 */
export interface VADEvent {
  /** Current VAD state. */
  state: VADState;
  /** Speech probability. NOTE(review): presumably 0-1, comparable to `VADConfig.threshold` — confirm. */
  speechProbability: number;
  /** Speech duration in milliseconds (per the field name). */
  speechDurationMs: number;
  /** Silence duration in milliseconds (per the field name). */
  silenceDurationMs: number;
}

/**
 * Speaker change event data (payload of the 'speaker' event type).
 */
export interface SpeakerEvent {
  /** ID of the speaker this event is about. */
  speakerId: number;
  /** Pending/confirmed status of that speaker. */
  status: SpeakerStatus;
  /** NOTE(review): presumably true on the event where the speaker first becomes 'confirmed' — confirm. */
  justConfirmed: boolean;
  /** Total number of speakers. */
  totalSpeakers: number;
  /** 0-1 match confidence (0 for new speakers) */
  confidence: number;
  /** 0-1 embedding quality score */
  embeddingQuality: number;
}

/**
 * Configuration for sentence-level finalization.
 * Detects sentence boundaries based on punctuation during streaming.
 * The defaults quoted below are the values held by
 * `defaultSentenceFinalizationConfig`.
 */
export interface SentenceFinalizationConfig {
  /** Enable sentence-level finalization. Default: false */
  enabled: boolean;
  /** Characters that mark end of sentence. Default: ['.', '!', '?'] */
  sentenceEndChars?: string[];
  /** Minimum words (counted over the buffered text) before checking for sentence boundary. Default: 2 */
  minWordsBeforeCheck?: number;
  /** Include the punctuation character in the emitted sentence. Default: true */
  includePunctuation?: boolean;
}

/**
 * Sentence event data - emitted when a complete sentence is detected.
 * Also emitted by `stopRecording()` for any text left in the sentence buffer,
 * in which case `endChar` is the empty string.
 */
export interface SentenceEvent {
  /** The complete sentence text */
  text: string;
  /** Always true for sentence events */
  isFinal: true;
  /** Remaining text after the sentence (continues accumulating) */
  remaining: string;
  /** The punctuation character that ended the sentence ('' when flushed at end of session) */
  endChar: string;
  /** Speaker ID if diarization is enabled */
  speakerId?: number;
  /** Timestamp when sentence was finalized (`Date.now()` for the end-of-session flush) */
  timestamp: number;
}

/**
 * Event types for the STTManager.on() method.
 * Each name is mapped to a native event name by `eventNameMap`, except
 * 'sentence', which is a virtual event produced in the TypeScript layer.
 */
export type STTEventType = 'transcript' | 'streaming' | 'vad' | 'speaker' | 'multiSpeaker' | 'sentence' | 'error';

/**
 * Default VAD configuration.
 * `createConfigJson` spreads the caller's `vad` settings over these values.
 */
const defaultVADConfig: VADConfig = {
  threshold: 0.5,
  minSpeechDurationMs: 300,
  minSilenceDurationMs: 500,
  maxSpeechDurationMs: 30000, // force break after this much continuous speech
  speechPaddingMs: 100,
  mode: 'normal',
};

/**
 * Default Diarization configuration.
 * Note: Thresholds tuned for sherpa-onnx speaker embedding model.
 * `createConfigJson` spreads the caller's partial `diarization` settings over
 * these values.
 */
const defaultDiarizationConfig: DiarizationConfig = {
  speakerThreshold: 0.55,      // Min similarity for speaker matching (lenient)
  embeddingThreshold: 0.70,    // Min similarity for profile update (strict)
  minSpeechDurationMs: 800,    // Min speech duration to create a speaker
  minEmbeddingUpdateMs: 500,   // Min speech duration to update a profile
  maxSpeakers: 10,             // Max tracked speakers
  embeddingAverageWeight: 0.2, // Weight for the running average update
  onboardingSegments: 3,       // First N segments always go to Speaker 1
  minSpeechForNewSpeaker: 3000, // Min ms before creating new speaker
};

/**
 * Default sentence finalization configuration.
 * Typed as `Required<...>` so every option has a concrete fallback value.
 * The feature is off by default; SentenceBoundaryDetector and
 * STTManager.initialize() force `enabled: true` when they build on top of it.
 */
const defaultSentenceFinalizationConfig: Required<SentenceFinalizationConfig> = {
  enabled: false,
  sentenceEndChars: ['.', '!', '?'],
  minWordsBeforeCheck: 2,
  includePunctuation: true,
};

/**
 * Utility class for detecting sentence boundaries in streaming text.
 * Can be used standalone or integrated with STTManager via sentenceFinalization config.
 *
 * @example
 * ```typescript
 * const detector = new SentenceBoundaryDetector();
 *
 * // Process streaming text chunks
 * const result1 = detector.process("Bonjour comment");
 * // result1 = null (no sentence boundary yet)
 *
 * const result2 = detector.process("allez-vous? Je vais");
 * // result2 = { sentence: "Bonjour comment allez-vous?", remaining: "Je vais", endChar: "?" }
 * ```
 */
export class SentenceBoundaryDetector {
  /** Text received so far that has not yet been emitted as a sentence (kept trimmed). */
  private buffer: string = '';
  /** Fully resolved settings; `enabled` is always true for a detector instance. */
  private config: Required<SentenceFinalizationConfig>;

  /**
   * @param config Optional overrides. Options that are omitted, or explicitly
   *   set to `undefined`, fall back to the module defaults.
   */
  constructor(config?: Partial<SentenceFinalizationConfig>) {
    const defaults = defaultSentenceFinalizationConfig;
    this.config = {
      // `??` instead of an object spread: an explicit `undefined` override must
      // not erase a default (process() would otherwise crash on `.map`).
      // The array is copied so later mutation by the caller cannot affect us.
      sentenceEndChars: [...(config?.sentenceEndChars ?? defaults.sentenceEndChars)],
      minWordsBeforeCheck: config?.minWordsBeforeCheck ?? defaults.minWordsBeforeCheck,
      includePunctuation: config?.includePunctuation ?? defaults.includePunctuation,
      enabled: true, // Always enabled when using detector directly
    };
  }

  /**
   * Process a text chunk and detect sentence boundaries.
   *
   * The chunk is appended to the internal buffer (space-separated). If the
   * buffer then contains a sentence terminator that is followed by whitespace
   * or sits at the very end, the text up to the first such terminator is
   * returned and removed from the buffer. At most one sentence is returned
   * per call; any further sentences stay buffered.
   *
   * @param text New text chunk to add to buffer
   * @returns Detected sentence info, or null if no complete sentence yet
   */
  process(text: string): { sentence: string; remaining: string; endChar: string } | null {
    // Append new text to buffer (blank chunks leave the buffer untouched)
    const chunk = text.trim();
    if (chunk) {
      this.buffer = this.buffer ? `${this.buffer} ${chunk}` : chunk;
    }
    if (!this.buffer) {
      return null;
    }

    // Check minimum word count (over the whole buffer, not just the sentence)
    const wordCount = this.buffer.split(/\s+/).length;
    if (wordCount < this.config.minWordsBeforeCheck) {
      return null;
    }

    // Without any usable terminator there is no boundary to find. (An empty
    // alternation would match the empty string and split on the first space.)
    const endPattern = SentenceBoundaryDetector.buildEndPattern(this.config.sentenceEndChars);
    if (endPattern === null) {
      return null;
    }

    // Match sentence ending followed by space and more text, or at end.
    // [\s\S] is used instead of `.` so a newline inside the buffer cannot
    // permanently prevent a match.
    const regex = new RegExp(`^([\\s\\S]+?)(${endPattern})(?:\\s+([\\s\\S]*))?$`);
    const match = regex.exec(this.buffer);
    if (!match) {
      return null;
    }

    const sentenceText = match[1] ?? '';
    const endChar = match[2] ?? '';
    const remaining = (match[3] ?? '').trim();

    const sentence = (
      this.config.includePunctuation ? `${sentenceText}${endChar}` : sentenceText
    ).trim();

    // Update buffer with remaining text
    this.buffer = remaining;

    return { sentence, remaining, endChar };
  }

  /**
   * Get the current buffer content
   */
  getBuffer(): string {
    return this.buffer;
  }

  /**
   * Clear the buffer and return any remaining text
   */
  flush(): string {
    const remaining = this.buffer;
    this.buffer = '';
    return remaining;
  }

  /**
   * Reset the detector state
   */
  reset(): void {
    this.buffer = '';
  }

  /**
   * Check if text contains sentence-ending punctuation.
   * Empty terminator strings are ignored (every string "includes" '').
   */
  static hasSentenceEnd(text: string, endChars: string[] = ['.', '!', '?']): boolean {
    return endChars.some(char => char.length > 0 && text.includes(char));
  }

  /**
   * Split text into sentences.
   *
   * Each sentence keeps its terminator; a run of consecutive terminators
   * ("...", "?!") stays attached to the sentence it ends. Trailing text
   * without a terminator is returned as the last element.
   *
   * @param text Text to split
   * @param endChars Sentence terminators (may be multi-character strings)
   * @returns Trimmed, non-empty sentences in order of appearance
   */
  static splitSentences(text: string, endChars: string[] = ['.', '!', '?']): string[] {
    const endPattern = SentenceBoundaryDetector.buildEndPattern(endChars);
    if (endPattern === null) {
      const whole = text.trim();
      return whole ? [whole] : [];
    }

    // With a capture group, split() yields [text, terminators, text, terminators, ...].
    // Empty entries must be kept so the text/terminator pairing stays aligned.
    const pattern = new RegExp(`((?:${endPattern})+)\\s*`, 'g');
    const parts = text.split(pattern);

    const sentences: string[] = [];
    for (let i = 0; i < parts.length; i += 2) {
      const sentence = `${parts[i] ?? ''}${parts[i + 1] ?? ''}`.trim();
      if (sentence) {
        sentences.push(sentence);
      }
    }
    return sentences;
  }

  /**
   * Build a regex alternation matching any of the given terminators, with
   * regex metacharacters escaped. Returns null when no non-empty terminator
   * is supplied.
   */
  private static buildEndPattern(endChars: readonly string[]): string | null {
    const escaped = endChars
      .filter(c => c.length > 0)
      .map(c => c.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
    return escaped.length > 0 ? escaped.join('|') : null;
  }
}

/**
 * Create config JSON string for native module
 */
const createConfigJson = (config: STTConfig): string => {
  // Caller-supplied settings win over the built-in defaults.
  const mergedVad = { ...defaultVADConfig, ...config.vad };
  const mergedDiarization = { ...defaultDiarizationConfig, ...config.diarization };
  const modelArchitecture = config.modelArchitecture ?? 'transducer';

  // A transducer is an encoder/decoder/joiner triple, so the joiner is mandatory.
  const joinerMissing = !config.joinerPath;
  if (joinerMissing && modelArchitecture === 'transducer') {
    throw new Error('joinerPath is required for transducer architecture');
  }

  // Whisper cannot stream: downgrade to offline and tell the developer.
  const requestedType = config.modelType ?? 'streaming';
  const forceOffline = modelArchitecture === 'whisper' && requestedType === 'streaming';
  if (forceOffline) {
    console.warn('Whisper models only support offline mode. Switching to offline.');
  }
  const modelType = forceOffline ? 'offline' : requestedType;

  // Only the known keys are forwarded, so stray properties never reach native code.
  const vadConfig = {
    threshold: mergedVad.threshold,
    minSpeechDurationMs: mergedVad.minSpeechDurationMs,
    minSilenceDurationMs: mergedVad.minSilenceDurationMs,
    maxSpeechDurationMs: mergedVad.maxSpeechDurationMs,
    speechPaddingMs: mergedVad.speechPaddingMs,
    mode: mergedVad.mode,
  };

  const diarization = {
    speakerThreshold: mergedDiarization.speakerThreshold,
    embeddingThreshold: mergedDiarization.embeddingThreshold,
    minSpeechDurationMs: mergedDiarization.minSpeechDurationMs,
    minEmbeddingUpdateMs: mergedDiarization.minEmbeddingUpdateMs,
    maxSpeakers: mergedDiarization.maxSpeakers,
    embeddingAverageWeight: mergedDiarization.embeddingAverageWeight,
    onboardingSegments: mergedDiarization.onboardingSegments,
    minSpeechForNewSpeaker: mergedDiarization.minSpeechForNewSpeaker,
  };

  // Key order is kept stable because it determines the serialized string.
  const payload = {
    encoderPath: config.encoderPath,
    decoderPath: config.decoderPath,
    joinerPath: config.joinerPath ?? '',
    tokensPath: config.tokensPath,
    modelArchitecture,
    modelType,
    // Whisper-specific settings ('' language means auto-detect)
    whisperLanguage: config.whisperLanguage ?? '',
    whisperTask: config.whisperTask ?? 'transcribe',
    vadModelPath: config.vadModelPath,
    vadConfig,
    diarizationModelPath: config.diarizationModelPath ?? null,
    diarization,
    sampleRate: config.sampleRate ?? 16000,
    denoiserModelPath: config.denoiserModelPath ?? '',
    provider: config.provider ?? 'cpu',
    normalizeCase: config.normalizeCase ?? 'none',
  };

  return JSON.stringify(payload);
};

/**
 * Event name mapping from friendly names to native event names.
 * Typed as `Record<STTEventType, string>`, so every event type must have an entry.
 * Note: 'sentence' is a virtual event handled at the TypeScript layer; its
 * placeholder value is not a native event name.
 */
const eventNameMap: Record<STTEventType, string> = {
  transcript: 'TranscriptUpdate',
  streaming: 'StreamingTranscriptUpdate',
  vad: 'VADUpdate',
  speaker: 'SpeakerUpdate',
  multiSpeaker: 'MultiSpeakerDetected',
  sentence: '__sentence__', // Virtual event - processed in TypeScript
  error: 'STTError',
};

/**
 * STTManager - Class-based API for speech-to-text
 *
 * Usage:
 * ```typescript
 * import STTManager from 'react-native-sherpa-onnx-offline-stt';
 *
 * const manager = new STTManager();
 * await manager.initialize(config);
 * manager.on('transcript', (result) => console.log(result));
 * await manager.startRecording();
 * ```
 */
export class STTManager {
  // Set to true once initialize() has awaited the native initializeSTT call.
  private _initialized = false;
  // Local mirror of the recording state: set after native startRecording()
  // resolves and cleared after native stopRecording() resolves.
  private _recording = false;
  // Active emitter subscriptions. NOTE(review): presumably keyed by event name;
  // the code that fills this map is outside the visible part of the file.
  private listeners: Map<string, EmitterSubscription> = new Map();

  // Sentence finalization
  // Detector created by initialize() when sentence finalization is enabled.
  private sentenceDetector: SentenceBoundaryDetector | null = null;
  // Resolved sentence finalization settings stored by initialize().
  private sentenceFinalizationConfig: Required<SentenceFinalizationConfig> | null = null;
  // Receiver of sentence events; stopRecording() calls it with any flushed text.
  // NOTE(review): where it is assigned is not visible here.
  private sentenceCallback: ((event: SentenceEvent) => void) | null = null;
  // Speaker ID attached to flushed sentence events; undefined when unknown.
  private currentSpeakerId: number | undefined = undefined;

  /**
   * Check if the STT engine is initialized
   */
  get initialized(): boolean {
    // Becomes true after initialize() has awaited the native initializeSTT call.
    return this._initialized;
  }

  /**
   * Check if currently recording
   */
  get recording(): boolean {
    // Locally tracked flag, not a native query; see isRecordingAsync() for that.
    return this._recording;
  }

  /**
   * Initialize the STT engine with configuration
   * @param config STT configuration including model paths and VAD settings
   */
  async initialize(config: STTConfig): Promise<void> {
    // Throws synchronously (surfacing as a rejected promise) on invalid config,
    // e.g. a transducer architecture without joinerPath.
    const configJson = createConfigJson(config);
    // Second argument is passed as the constant 1.
    // NOTE(review): presumably the channel count (mono) — confirm against the native signature.
    await nativeSTT.initializeSTT(config.sampleRate ?? 16000, 1, configJson);
    this._initialized = true;

    // Rebuild sentence finalization state from this config alone, so that
    // re-initializing with the feature disabled does not keep using the
    // detector and settings of a previous initialize() call.
    if (config.sentenceFinalization?.enabled) {
      this.sentenceFinalizationConfig = {
        ...defaultSentenceFinalizationConfig,
        ...config.sentenceFinalization,
        enabled: true,
      };
      this.sentenceDetector = new SentenceBoundaryDetector(this.sentenceFinalizationConfig);
    } else {
      this.sentenceFinalizationConfig = null;
      this.sentenceDetector = null;
    }
  }

  /**
   * Start recording and transcribing audio from microphone
   * @throws Error if not initialized
   */
  async startRecording(): Promise<void> {
    const ready = this._initialized;
    if (!ready) {
      throw new Error('STTManager not initialized. Call initialize() first.');
    }

    // A new session must not inherit buffered text from the previous one.
    if (this.sentenceDetector) {
      this.sentenceDetector.reset();
    }

    // The flag flips only once the native side has actually started.
    await nativeSTT.startRecording();
    this._recording = true;
  }

  /**
   * Stop recording and return final results
   * @returns Array of transcription results
   * @throws Error if not initialized
   */
  async stopRecording(): Promise<STTResult[]> {
    if (!this._initialized) {
      throw new Error('STTManager not initialized. Call initialize() first.');
    }

    const finalResults = await nativeSTT.stopRecording();
    this._recording = false;

    // Nothing to flush unless both a detector and a sentence listener exist.
    if (!this.sentenceDetector || !this.sentenceCallback) {
      return finalResults;
    }

    // Emit whatever is still buffered as one last sentence event.
    const leftover = this.sentenceDetector.flush().trim();
    if (leftover) {
      this.sentenceCallback({
        text: leftover,
        isFinal: true,
        remaining: '',
        endChar: '', // flushed at end of session, so there is no terminator
        speakerId: this.currentSpeakerId,
        timestamp: Date.now(),
      });
    }

    return finalResults;
  }

  /**
   * Transcribe audio from a file
   * @param filePath Path to audio file (WAV format, 16kHz recommended)
   * @returns Array of transcription results
   * @throws Error if not initialized
   */
  async recognizeFile(filePath: string): Promise<STTResult[]> {
    // Happy path first: hand the file to the native recognizer.
    if (this._initialized) {
      return nativeSTT.recognizeFile(filePath);
    }
    throw new Error('STTManager not initialized. Call initialize() first.');
  }

  /**
   * Check if currently recording (async version)
   */
  async isRecordingAsync(): Promise<boolean> {
    // Asks the native module instead of reading the locally cached `_recording` flag.
    return nativeSTT.isRecording();
  }

  /**
   * Get the number of detected speakers (diarization)
   */
  async getSpeakerCount(): Promise<number> {
    // Direct pass-through; no initialized-check is made on the JS side.
    return nativeSTT.getSpeakerCount();
  }

  /**
   * Reset speaker profiles (start fresh diarization)
   */
  async resetSpeakers(): Promise<void> {
    // Only the native profiles are reset; the JS-side `currentSpeakerId` is not touched here.
    return nativeSTT.resetSpeakers();
  }

  /**
   * Merge source speaker into target speaker.
   * Combines embeddings (weighted by segment count) and removes source.
   * Useful for correcting misidentifications.
   * @param sourceId Speaker ID to merge from (will be removed)
   * @param targetId Speaker ID to merge into (will be kept)
   * @returns true if successful, false if either speaker not found
   */
  async mergeSpeakers(sourceId: number, targetId: number): Promise<boolean> {
    // Argument order matters: source (removed) first, target (kept) second.
    return nativeSTT.mergeSpeakers(sourceId, targetId);
  }

  /**
   * Remove a speaker profile.
   * @param speakerId Speaker ID to remove
   * @returns true if speaker existed and was removed, false if not found
   */
  async removeSpeaker(speakerId: number): Promise<boolean> {
    // Direct pass-through; the boolean comes from the native module.
    return nativeSTT.removeSpeaker(speakerId);
  }

  /**
   * Get all speaker profiles.
   * Useful for debugging, displaying speaker stats, or exporting profiles.
   * @returns Array of speaker profile information
   */
  async getSpeakerProfiles(): Promise<SpeakerProfileInfo[]> {
    // The native result is returned as-is; its shape is not validated here.
    return nativeSTT.getSpeakerProfiles();
  }

  // =====================
  // Pyannote Segmentation
  // =====================

  /**
   * Initialize pyannote-based speaker segmentation.
   * This provides more accurate speaker change detection using the pyannote model.
   *
   * Model download: https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
   * Use sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
   *
   * @param config Pyannote segmentation configuration
   * @returns true if initialization successful
   */
  async initializePyannoteSegmentation(config: PyannoteSegmentationConfig): Promise<boolean> {
    // Fill in the documented defaults so the native side always receives a complete object.
    const payload = {
      segmentationModelPath: config.segmentationModelPath,
      embeddingModelPath: config.embeddingModelPath,
      numSpeakers: config.numSpeakers ?? 0,
      clusteringThreshold: config.clusteringThreshold ?? 0.5,
      minDurationOn: config.minDurationOn ?? 0.5,
      minDurationOff: config.minDurationOff ?? 0.3,
      numThreads: config.numThreads ?? 2,
      provider: config.provider ?? 'cpu',
    };
    // The native method takes the configuration as a JSON string.
    const serialized = JSON.stringify(payload);
    return nativeSTT.initializePyannoteSegmentation(serialized);
  }

  /**
   * Set the diarization mode.
   * @param mode 'embedding' (default) or 'pyannote'
   * @returns true if mode was set successfully
   */
  async setDiarizationMode(mode: DiarizationMode): Promise<boolean> {
    // The mode string is forwarded unchanged to the native module.
    return nativeSTT.setDiarizationMode(mode);
  }

  /**
   * Get the current diarization mode.
   * @returns Current mode: 'embedding' or 'pyannote'
   */
  async getDiarizationMode(): Promise<DiarizationMode> {
    // The native value is returned as-is and not checked against DiarizationMode.
    return nativeSTT.getDiarizationMode();
  }

  /**
   * Process an audio file with pyannote diarization.
   * Returns detailed speaker segments with timestamps.
   *
   * Requires pyannote segmentation to be initialized first via initializePyannoteSegmentation().
   *
   * @param filePath Path to WAV audio file (16kHz mono recommended)
   * @returns Diarization result with segments and speaker info
   */
  async diarizeFile(filePath: string): Promise<DiarizationResult> {
    // No JS-side check that pyannote was initialized; that is left to the native module.
    return nativeSTT.diarizeFile(filePath);
  }

  /**
   * Set expected number of speakers for pyannote diarization.
   * Use 0 for auto-detection based on clustering threshold.
   *
   * @param numSpeakers Number of expected speakers (0 for auto)
   */
  async setPyannoteNumSpeakers(numSpeakers: number): Promise<boolean> {
    // The value is forwarded without range validation.
    return nativeSTT.setPyannoteNumSpeakers(numSpeakers);
  }

  /**
   * Set clustering threshold for pyannote auto speaker detection.
   * Only used when numSpeakers = 0.
   * Smaller values = more speakers detected.
   *
   * @param threshold Clustering distance threshold (default 0.5)
   */
  async setPyannoteClusteringThreshold(threshold: number): Promise<boolean> {
    // The value is forwarded without range validation.
    return nativeSTT.setPyannoteClusteringThreshold(threshold);
  }

  /**
   * Check if pyannote segmentation is initialized.
   */
  async isPyannoteInitialized(): Promise<boolean> {
    // State lives in the native module; nothing is cached on the JS side.
    return nativeSTT.isPyannoteInitialized();
  }

  // =====================
  // Audio Capture (Debug)
  // =====================

  /**
   * Start capturing audio to a WAV file.
   * @param filePath Absolute path to save the WAV file
   * @param captureType Type of audio to capture:
   *   - 'raw': Direct microphone audio before any processing
   *   - 'processed': Audio after denoiser, just before STT model
   * @returns true if capture started successfully
   */
  async startAudioCapture(filePath: string, captureType: AudioCaptureType = 'raw'): Promise<boolean> {
    // Capture type defaults to 'raw' when the caller omits it.
    return nativeSTT.startAudioCapture(filePath, captureType);
  }

  /**
   * Stop audio capture and finalize the WAV file.
   * @param captureType Type of capture to stop ('raw', 'processed', or 'both')
   * @returns true if capture was stopped and file was finalized
   */
  async stopAudioCapture(captureType: AudioCaptureType | 'both' = 'both'): Promise<boolean> {
    // 'both' is a JS-level convenience that maps onto the native stop-all call.
    return captureType === 'both'
      ? nativeSTT.stopAllAudioCapture()
      : nativeSTT.stopAudioCapture(captureType);
  }

  /**
   * Stop all audio captures (both raw and processed).
   * @returns true if all captures were stopped
   */
  async stopAllAudioCapture(): Promise<boolean> {
    // Same native call that stopAudioCapture('both') makes.
    return nativeSTT.stopAllAudioCapture();
  }

  /**
   * Check if audio capture is currently active.
   * @param captureType Type to check ('raw', 'processed', or 'any')
   * @returns true if specified capture type is active
   */
  async isAudioCaptureActive(captureType: AudioCaptureType | 'any' = 'any'): Promise<boolean> {
    // Unlike 'both' in stopAudioCapture(), 'any' is passed through to the native module as-is.
    return nativeSTT.isAudioCaptureActive(captureType);
  }

  /**
   * Get the status of all audio captures.
   * @returns Object with rawEnabled and processedEnabled booleans
   */
  async getAudioCaptureStatus(): Promise<AudioCaptureStatus> {
    // The native result is returned as-is; its shape is not validated here.
    return nativeSTT.getAudioCaptureStatus();
  }

  /**
   * Get the capture directory path (app's cache directory).
   * Use this to construct file paths for audio capture.
   * @returns Absolute path to the capture directory
   */
  async getCaptureDirectory(): Promise<string> {
    // The path is supplied by the native module; nothing is computed on the JS side.
    return nativeSTT.getCaptureDirectory();
  }

  /**
   * List all captured audio files in the cache directory.
   * @returns Array of captured file info (path, name, size, lastModified)
   */
  async listCapturedFiles(): Promise<CapturedFileInfo[]> {
    // The native result is returned as-is; no sorting or filtering happens here.
    return nativeSTT.listCapturedFiles();
  }

  /**
   * Delete a captured audio file.
   * @param filePath Path to the file to delete
   * @returns true if file was deleted
   */
  async deleteCapturedFile(filePath: string): Promise<boolean> {
    // The path is forwarded unchanged; the JS side does not check that it lies
    // inside the capture directory.
    return nativeSTT.deleteCapturedFile(filePath);
  }

  /**
   * Enable or disable the denoiser
   * @param enabled Whether to enable the denoiser
   */
  async setDenoiserEnabled(enabled: boolean): Promise<boolean> {
    // Resolves with the boolean reported by the native module.
    return nativeSTT.setDenoiserEnabled(enabled);
  }

  /**
   * Check if denoiser is currently enabled
   */
  async isDenoiserEnabled(): Promise<boolean> {
    // State lives in the native module; nothing is cached on the JS side.
    return nativeSTT.isDenoiserEnabled();
  }

  // =====================
  // Echo Cancellation (AEC)
  // =====================

  /**
   * Check if Acoustic Echo Cancellation is available on this device.
   * AEC availability depends on the device hardware and Android version.
   * @returns true if AEC is available
   */
  async isAecAvailable(): Promise<boolean> {
    // Availability is determined entirely by the native module.
    return nativeSTT.isAecAvailable();
  }

  /**
   * Enable or disable Acoustic Echo Cancellation.
   * AEC helps prevent feedback/echo when playing audio through speakers
   * while recording from the microphone (e.g., WebRTC + STT scenarios).
   *
   * Note: AEC is enabled by default. Changes take effect on next recording start.
   *
   * @param enabled Whether to enable AEC
   * @returns true if setting was applied
   */
  async setAecEnabled(enabled: boolean): Promise<boolean> {
    // Resolves with the boolean reported by the native module.
    return nativeSTT.setAecEnabled(enabled);
  }

  /**
   * Check if AEC is currently enabled.
   * @returns true if AEC is enabled (will be applied when recording starts)
   */
  async isAecEnabled(): Promise<boolean> {
    // Reads the setting from the native module; nothing is cached on the JS side.
    return nativeSTT.isAecEnabled();
  }

  /**
   * Choose which audio source is used for recording.
   *
   * Available sources:
   *
   * - `voice_communication` (default): optimized for VoIP with AEC support.
   *   Best when audio is played through the speakers while recording.
   *
   * - `voice_recognition`: optimized for speech recognition without VoIP.
   *   Better quality for pure STT without echo concerns.
   *
   * - `mic`: default microphone input; raw audio without preprocessing.
   *
   * Note: a change only takes effect the next time recording starts.
   *
   * @param source Audio source: 'voice_communication' | 'voice_recognition' | 'mic'
   * @returns true if the setting was applied
   */
  async setAudioSource(source: AudioSource): Promise<boolean> {
    const applied: boolean = await nativeSTT.setAudioSource(source);
    return applied;
  }

  /**
   * Read back the audio source setting from the native module.
   * @returns Current audio source
   */
  async getAudioSource(): Promise<AudioSource> {
    const currentSource: AudioSource = await nativeSTT.getAudioSource();
    return currentSource;
  }

  // =====================
  // Punctuation Methods
  // =====================

  /**
   * Load the punctuation restoration model.
   * This has to be called before addPunctuation() can be used.
   *
   * Missing options fall back to `numThreads: 2` and `provider: 'cpu'`. The
   * resulting configuration is handed to the native module as a JSON string.
   *
   * @param config Punctuation configuration with model path
   * @returns true if initialization was successful
   *
   * @example
   * ```typescript
   * await sttManager.initializePunctuation({
   *   modelPath: '/path/to/ct_transformer_fr.int8.onnx',
   *   numThreads: 2,
   *   provider: 'cpu',
   * });
   * ```
   */
  async initializePunctuation(config: PunctuationConfig): Promise<boolean> {
    const modelPath = config.modelPath;
    // `??` (not `||`) so only null/undefined trigger the fallback values.
    const numThreads = config.numThreads ?? 2;
    const provider = config.provider ?? 'cpu';

    // Key order is kept as modelPath, numThreads, provider.
    const serializedConfig = JSON.stringify({ modelPath, numThreads, provider });

    const initialized: boolean = await nativeSTT.initializePunctuation(serializedConfig);
    return initialized;
  }

  /**
   * Restore punctuation in unpunctuated text.
   * initializePunctuation() must have been called beforehand.
   *
   * @param text Unpunctuated text (e.g., "bonjour comment allez vous")
   * @returns Punctuated text (e.g., "Bonjour, comment allez-vous?")
   *
   * @example
   * ```typescript
   * const punctuated = await sttManager.addPunctuation("bonjour comment allez vous");
   * console.log(punctuated); // "Bonjour, comment allez-vous?"
   * ```
   */
  async addPunctuation(text: string): Promise<string> {
    const punctuatedText: string = await nativeSTT.addPunctuation(text);
    return punctuatedText;
  }

  /**
   * Query whether the punctuation model has been initialized.
   * @returns The initialization flag as reported by the native module
   */
  async isPunctuationInitialized(): Promise<boolean> {
    const modelReady: boolean = await nativeSTT.isPunctuationInitialized();
    return modelReady;
  }

  /**
   * Turn punctuation processing on or off.
   * @param enabled Whether punctuation should be enabled
   * @returns The boolean reported back by the native module
   */
  async setPunctuationEnabled(enabled: boolean): Promise<boolean> {
    const nativeResult: boolean = await nativeSTT.setPunctuationEnabled(enabled);
    return nativeResult;
  }

  /**
   * Query whether punctuation processing is currently enabled.
   * @returns The enabled flag as reported by the native module
   */
  async isPunctuationEnabled(): Promise<boolean> {
    const punctuationOn: boolean = await nativeSTT.isPunctuationEnabled();
    return punctuationOn;
  }

  // =====================
  // Sentence Finalization Methods
  // =====================

  /**
   * Clear the sentence detector's buffer.
   * Useful when changing context or starting a new paragraph.
   * Does nothing when no sentence detector exists.
   */
  resetSentenceDetector(): void {
    const detector = this.sentenceDetector;
    if (detector != null) {
      detector.reset();
    }
  }

  /**
   * Peek at the text currently held by the sentence detector.
   * @returns Accumulated text that hasn't been emitted as a sentence yet,
   *          or an empty string when no sentence detector exists
   */
  getSentenceBuffer(): string {
    const detector = this.sentenceDetector;
    if (detector == null) {
      return '';
    }
    return detector.getBuffer() ?? '';
  }

  /**
   * Flush the sentence detector and hand back whatever text it returns.
   * Does NOT emit a sentence event - use this for manual processing.
   * @returns Remaining text that was in the buffer, or an empty string when
   *          no sentence detector exists
   */
  flushSentenceBuffer(): string {
    const detector = this.sentenceDetector;
    if (detector == null) {
      return '';
    }
    return detector.flush() ?? '';
  }

  /**
   * Report whether sentence finalization is enabled.
   * @returns true when the sentence detector is not null
   */
  isSentenceFinalizationEnabled(): boolean {
    const detector = this.sentenceDetector;
    return detector !== null;
  }

  /**
   * Turn sentence finalization on at runtime (after initialization).
   * Useful for toggling sentence detection on/off during a session.
   *
   * A fresh detector is created on every call, replacing any existing one.
   *
   * @param config Optional overrides; anything not provided falls back to
   *               defaultSentenceFinalizationConfig
   */
  enableSentenceFinalization(config?: Partial<SentenceFinalizationConfig>): void {
    // Defaults first, caller overrides second, and `enabled` is forced to true
    // last so it wins over whatever the caller passed.
    const effectiveConfig = {
      ...defaultSentenceFinalizationConfig,
      ...config,
      enabled: true,
    };

    this.sentenceFinalizationConfig = effectiveConfig;
    this.sentenceDetector = new SentenceBoundaryDetector(effectiveConfig);
  }

  /**
   * Turn sentence finalization off.
   * The sentence callback will no longer receive events.
   * @returns Any text the detector still held before it was dropped
   *          (empty string when there was no detector)
   */
  disableSentenceFinalization(): string {
    let leftover = '';
    if (this.sentenceDetector) {
      // Drain the detector before discarding it so buffered text is not lost.
      leftover = this.sentenceDetector.flush() ?? '';
    }

    this.sentenceDetector = null;
    this.sentenceFinalizationConfig = null;

    return leftover;
  }

  /**
   * Ask the native module which model type is in use.
   * @returns The current model type (streaming or offline)
   */
  async getModelType(): Promise<ModelType> {
    const modelType: ModelType = await nativeSTT.getModelType();
    return modelType;
  }

  /**
   * Start the background service used for recording while the app is in the
   * background. A notification is shown while it is active.
   * @returns The boolean reported back by the native module
   */
  async startBackgroundService(): Promise<boolean> {
    const nativeResult: boolean = await nativeSTT.startBackgroundService();
    return nativeResult;
  }

  /**
   * Stop the background service.
   * @returns The boolean reported back by the native module
   */
  async stopBackgroundService(): Promise<boolean> {
    const nativeResult: boolean = await nativeSTT.stopBackgroundService();
    return nativeResult;
  }

  /**
   * Query whether the background service is currently running.
   * @returns The running flag as reported by the native module
   */
  async isBackgroundServiceRunning(): Promise<boolean> {
    const serviceRunning: boolean = await nativeSTT.isBackgroundServiceRunning();
    return serviceRunning;
  }

  /**
   * Subscribe to transcript events
   */
  on(event: 'transcript', callback: (result: STTResult) => void): this;
  /**
   * Subscribe to streaming transcript events (volatile/confirmed)
   */
  on(event: 'streaming', callback: (update: StreamingTranscriptUpdate) => void): this;
  /**
   * Subscribe to VAD state change events
   */
  on(event: 'vad', callback: (event: VADEvent) => void): this;
  /**
   * Subscribe to speaker change events (diarization)
   */
  on(event: 'speaker', callback: (event: SpeakerEvent) => void): this;
  /**
   * Subscribe to multi-speaker detection events
   * Emitted when multiple speakers are detected in a single VAD segment
   */
  on(event: 'multiSpeaker', callback: (event: MultiSpeakerEvent) => void): this;
  /**
   * Subscribe to sentence events (sentence-level finalization)
   * Emitted when a complete sentence is detected based on punctuation.
   *
   * Sentences are only produced while sentence finalization is enabled
   * (sentenceFinalization.enabled in config, or enableSentenceFinalization()).
   * Subscribing before it is enabled logs a warning but is allowed: the
   * callback starts receiving events once a sentence detector exists.
   */
  on(event: 'sentence', callback: (event: SentenceEvent) => void): this;
  /**
   * Subscribe to error events
   */
  on(event: 'error', callback: (error: STTError) => void): this;
  /**
   * Subscribe to STT events.
   *
   * Only one listener is kept per event type: subscribing again for the same
   * event removes the previous listener first.
   *
   * @param event Event type: 'transcript' | 'streaming' | 'vad' | 'speaker' |
   *              'multiSpeaker' | 'sentence' | 'error'
   * @param callback Callback function for the event
   * @returns this (for chaining)
   * @throws Error if the event type is not one of the known types
   */
  on(event: STTEventType, callback: Function): this {
    // Remove existing listener for this event type if any
    this.off(event);

    const nativeEventName = eventNameMap[event];
    let wrappedCallback: (eventData: any) => void;

    // Each wrapper copies only the known fields out of the native payload so
    // the callback receives an object shaped like the public event type.
    switch (event) {
      case 'transcript':
        wrappedCallback = (eventData: any) => {
          (callback as (result: STTResult) => void)({
            text: eventData.text,
            isFinal: eventData.isFinal,
            startTime: eventData.startTime,
            endTime: eventData.endTime,
            confidence: eventData.confidence,
            processingTime: eventData.processingTime,
            audioDuration: eventData.audioDuration,
            rtfx: eventData.rtfx,
            speakerId: eventData.speakerId,
            speakerStatus: eventData.speakerStatus,
            tokenTimings: eventData.tokenTimings,
          });
        };
        break;
      case 'streaming':
        wrappedCallback = (eventData: any) => {
          (callback as (update: StreamingTranscriptUpdate) => void)({
            volatile: eventData.volatile,
            confirmed: eventData.confirmed,
            fullText: eventData.fullText,
            isFinal: eventData.isFinal,
            confidence: eventData.confidence,
            processingTime: eventData.processingTime,
            rtfx: eventData.rtfx,
          });
        };
        break;
      case 'vad':
        wrappedCallback = (eventData: any) => {
          (callback as (event: VADEvent) => void)({
            state: eventData.state,
            speechProbability: eventData.speechProbability,
            speechDurationMs: eventData.speechDurationMs,
            silenceDurationMs: eventData.silenceDurationMs,
          });
        };
        break;
      case 'speaker':
        wrappedCallback = (eventData: any) => {
          (callback as (event: SpeakerEvent) => void)({
            speakerId: eventData.speakerId,
            status: eventData.status,
            justConfirmed: eventData.justConfirmed,
            totalSpeakers: eventData.totalSpeakers,
            // Missing numeric fields are reported as 0
            confidence: eventData.confidence ?? 0,
            embeddingQuality: eventData.embeddingQuality ?? 0,
          });
        };
        break;
      case 'multiSpeaker':
        wrappedCallback = (eventData: any) => {
          (callback as (event: MultiSpeakerEvent) => void)({
            speakers: eventData.speakers,
            // A payload without windows yields an empty array
            windows: eventData.windows?.map((w: any) => ({
              timestampMs: w.timestampMs,
              speakerId: w.speakerId,
            })) ?? [],
            speakerCount: eventData.speakerCount,
          });
        };
        break;
      case 'sentence': {
        // Sentence event is a virtual event - we subscribe to streaming internally
        if (!this.sentenceDetector) {
          console.warn(
            'STTManager: sentence event requires sentenceFinalization.enabled in config'
          );
        }
        this.sentenceCallback = callback as (event: SentenceEvent) => void;

        // Subscribe even when no detector exists yet. The handler looks up
        // this.sentenceDetector on every event, so it stays idle until a
        // detector is created (e.g. via enableSentenceFinalization()) and then
        // starts delivering sentences. Returning without subscribing here
        // would leave the stored callback permanently unreachable.
        wrappedCallback = (eventData: any) => {
          // Track speaker ID from streaming events
          if (eventData.speakerId !== undefined) {
            this.currentSpeakerId = eventData.speakerId;
          }

          // Process text through sentence detector
          const text = eventData.fullText || eventData.volatile || '';
          if (text && this.sentenceDetector) {
            const result = this.sentenceDetector.process(text);
            if (result && this.sentenceCallback) {
              this.sentenceCallback({
                text: result.sentence,
                isFinal: true,
                remaining: result.remaining,
                endChar: result.endChar,
                speakerId: this.currentSpeakerId,
                timestamp: Date.now(),
              });
            }
          }
        };

        // The subscription targets the native streaming event but is stored
        // under the 'sentence' key, so off('sentence') removes it without
        // touching a separate 'streaming' listener.
        const streamingSubscription = emitter.addListener(
          eventNameMap['streaming'],
          wrappedCallback
        );
        this.listeners.set('sentence', streamingSubscription);
        return this;
      }
      case 'error':
        wrappedCallback = (eventData: any) => {
          (callback as (error: STTError) => void)({
            code: eventData.code,
            message: eventData.message,
          });
        };
        break;
      default:
        throw new Error(`Unknown event type: ${event}`);
    }

    const subscription = emitter.addListener(nativeEventName, wrappedCallback);
    this.listeners.set(event, subscription);

    return this;
  }

  /**
   * Remove the listener registered for an event, if there is one.
   * @param event Event type to unsubscribe from
   * @returns this (for chaining)
   */
  off(event: STTEventType): this {
    const existing = this.listeners.get(event);
    if (!existing) {
      // Nothing registered for this event
      return this;
    }

    existing.remove();
    this.listeners.delete(event);
    return this;
  }

  /**
   * Clean up native resources and reset the JS-side state.
   *
   * Note: Event listeners are preserved to support re-initialization with a different mode.
   * This includes the callback registered for the virtual 'sentence' event.
   * Call removeAllListeners() explicitly if you want to remove listeners.
   *
   * @returns Promise that settles once the native deinitialize call has
   *          settled; it rejects if the native call fails
   */
  async deinitialize(): Promise<void> {
    // Start native cleanup first, keeping the returned value so that a failure
    // is reported to the caller instead of becoming an unhandled rejection.
    const nativeCleanup = nativeSTT.deinitialize();

    // Reset sentence detection state. This happens synchronously, before the
    // native call has settled, so the flags below are already cleared when
    // this method returns its promise.
    this.sentenceDetector?.reset();
    this.sentenceDetector = null;
    this.sentenceFinalizationConfig = null;
    this.currentSpeakerId = undefined;
    // this.sentenceCallback is intentionally NOT cleared: the 'sentence'
    // subscription stays registered like every other listener, and without
    // its callback it could never deliver again after re-initialization.

    this._initialized = false;
    this._recording = false;

    await nativeCleanup;
  }

  /**
   * Drop every registered event subscription.
   * Call this only when you're completely done with STT and won't reinitialize.
   */
  removeAllListeners(): void {
    for (const subscription of this.listeners.values()) {
      subscription.remove();
    }
    this.listeners.clear();
  }

  /**
   * Ask the native module which ONNX Runtime providers exist on this device.
   * @returns Device info, available providers, and recommended provider
   */
  static async getAvailableProviders(): Promise<DeviceProvidersInfo> {
    const providersInfo: DeviceProvidersInfo = await nativeSTT.getAvailableProviders();
    return providersInfo;
  }

  /**
   * The current platform, read from React Native's `Platform.OS`.
   */
  static get platform(): string {
    const { OS: currentOs } = Platform;
    return currentOs;
  }
}

// Re-export the EmitterSubscription type (imported from react-native at the
// top of this file). This is a type-only export.
export type { EmitterSubscription };

// Default export of this module
export default STTManager;
