import { ResourceSource } from './common';
import { RnExecutorchError } from '../errors/errorUtils';

/**
 * Named Speech to Text model variants.
 *
 * The `-en` suffix marks English-only models (see `SpeechToTextLanguage`
 * for the multilingual variants' language support); the `-quantized`
 * suffix marks quantized builds of the same model.
 * @category Types
 */
export type SpeechToTextModelName =
  | 'whisper-tiny-en'
  | 'whisper-tiny-en-quantized'
  | 'whisper-base-en'
  | 'whisper-base-en-quantized'
  | 'whisper-small-en'
  | 'whisper-small-en-quantized'
  | 'whisper-tiny'
  | 'whisper-base'
  | 'whisper-small';

/**
 * Configuration accepted by the Speech to Text hook.
 * @category Types
 */
export interface SpeechToTextProps {
  /**
   * Configuration object containing the model name, sources and
   * multilingual flag — see `SpeechToTextModelConfig`.
   */
  model: SpeechToTextModelConfig;
  /**
   * When `true`, prevents automatic model loading (and the data download
   * on first use) after the hook runs.
   */
  preventLoad?: boolean;
}

/**
 * Shape of the object returned by the Speech to Text (STT) React hook.
 * @category Types
 */
export interface SpeechToTextType {
  /**
   * The error object if the model failed to load or run; `null` otherwise.
   */
  error: null | RnExecutorchError;

  /**
   * Indicates whether the model has successfully loaded and is ready for inference.
   */
  isReady: boolean;

  /**
   * Indicates whether the model is currently processing an inference.
   */
  isGenerating: boolean;

  /**
   * Tracks the progress of the model download process.
   */
  downloadProgress: number;

  /**
   * Runs the encoder part of the model on the provided waveform.
   * @param waveform - The input audio waveform array.
   * @returns A promise resolving to the encoder output.
   */
  encode(waveform: Float32Array): Promise<Float32Array>;

  /**
   * Runs a single pass of the model's decoder.
   * @param tokens - Token IDs to condition the decoder on (typically the
   * tokens generated so far) — not audio data.
   * @param encoderOutput - The output previously produced by `encode`.
   * @returns A promise resolving to the raw decoder output (scores/logits)
   * as a `Float32Array` — not decoded text.
   */
  decode(
    tokens: Int32Array,
    encoderOutput: Float32Array
  ): Promise<Float32Array>;

  /**
   * Starts a transcription process for a given input array, which should be a waveform at 16kHz.
   * @param waveform - The input audio waveform.
   * @param options - Decoding options, check API reference for more details.
   * @returns Resolves a promise with the output transcription. Result of transcription is
   * object of type `TranscriptionResult`.
   */
  transcribe(
    waveform: Float32Array,
    // `?` already includes `undefined`; the explicit `| undefined` was redundant.
    options?: DecodingOptions
  ): Promise<TranscriptionResult>;

  /**
   * Starts a streaming transcription process.
   * Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream.
   * Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses.
   * @param options - Decoding options including language.
   * @returns Asynchronous generator that yields `committed` and `nonCommitted` transcriptions,
   * both of type `TranscriptionResult`.
   */
  stream(options?: DecodingOptions): AsyncGenerator<
    {
      committed: TranscriptionResult;
      nonCommitted: TranscriptionResult;
    },
    void,
    unknown
  >;

  /**
   * Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription.
   * @param waveform - The audio chunk to insert.
   */
  streamInsert(waveform: Float32Array): void;

  /**
   * Stops the ongoing streaming transcription process.
   */
  streamStop(): void;
}

/**
 * Languages supported by the multilingual whisper models
 * (not the English-only `whisper-*-en` variants).
 * Codes are two-letter ISO 639-1 identifiers.
 * @category Types
 */
export type SpeechToTextLanguage =
  | 'af' // Afrikaans
  | 'sq' // Albanian
  | 'ar' // Arabic
  | 'hy' // Armenian
  | 'az' // Azerbaijani
  | 'eu' // Basque
  | 'be' // Belarusian
  | 'bn' // Bengali
  | 'bs' // Bosnian
  | 'bg' // Bulgarian
  | 'my' // Burmese
  | 'ca' // Catalan
  | 'zh' // Chinese
  | 'hr' // Croatian
  | 'cs' // Czech
  | 'da' // Danish
  | 'nl' // Dutch
  | 'et' // Estonian
  | 'en' // English
  | 'fi' // Finnish
  | 'fr' // French
  | 'gl' // Galician
  | 'ka' // Georgian
  | 'de' // German
  | 'el' // Greek
  | 'gu' // Gujarati
  | 'ht' // Haitian Creole
  | 'he' // Hebrew
  | 'hi' // Hindi
  | 'hu' // Hungarian
  | 'is' // Icelandic
  | 'id' // Indonesian
  | 'it' // Italian
  | 'ja' // Japanese
  | 'kn' // Kannada
  | 'kk' // Kazakh
  | 'km' // Khmer
  | 'ko' // Korean
  | 'lo' // Lao
  | 'lv' // Latvian
  | 'lt' // Lithuanian
  | 'mk' // Macedonian
  | 'mg' // Malagasy
  | 'ms' // Malay
  | 'ml' // Malayalam
  | 'mt' // Maltese
  | 'mr' // Marathi
  | 'ne' // Nepali
  | 'no' // Norwegian
  | 'fa' // Persian
  | 'pl' // Polish
  | 'pt' // Portuguese
  | 'pa' // Punjabi
  | 'ro' // Romanian
  | 'ru' // Russian
  | 'sr' // Serbian
  | 'si' // Sinhala
  | 'sk' // Slovak
  | 'sl' // Slovenian
  | 'es' // Spanish
  | 'su' // Sundanese
  | 'sw' // Swahili
  | 'sv' // Swedish
  | 'tl' // Tagalog
  | 'tg' // Tajik
  | 'ta' // Tamil
  | 'te' // Telugu
  | 'th' // Thai
  | 'tr' // Turkish
  | 'uk' // Ukrainian
  | 'ur' // Urdu
  | 'uz' // Uzbek
  | 'vi' // Vietnamese
  | 'cy' // Welsh
  | 'yi'; // Yiddish

/**
 * Options for decoding speech to text.
 * @category Types
 */
export interface DecodingOptions {
  /**
   * Optional language code to guide the transcription
   * (intended for the multilingual models — see `SpeechToTextLanguage`).
   */
  language?: SpeechToTextLanguage;
  /**
   * Optional flag. If set, the transcription result is presented with
   * timestamps and additional parameters. For more details please refer
   * to `TranscriptionResult`.
   */
  verbose?: boolean;
}

/**
 * Structure that represents a single token with timestamp information.
 * @category Types
 */
export interface Word {
  /** The token as a string value. */
  word: string;
  /** Timestamp of the beginning of the token in the audio (in seconds). */
  start: number;
  /** Timestamp of the end of the token in the audio (in seconds). */
  end: number;
}

/**
 * Structure that represents a single segment of a transcription.
 * Only `words` is optional; every other field is always present.
 * @category Types
 */
export interface TranscriptionSegment {
  /** Timestamp of the beginning of the segment in the audio (in seconds). */
  start: number;
  /** Timestamp of the end of the segment in the audio (in seconds). */
  end: number;
  /** Full text of the given segment. */
  text: string;
  /**
   * Word-level timestamping as an array of `Word`; present only when
   * `verbose` is set to `true` in `DecodingOptions`.
   */
  words?: Word[];
  /** Raw tokens represented as an array of integers. */
  tokens: number[];
  /** Temperature for which this segment was computed. */
  temperature: number;
  /** Average log probability calculated across all tokens in the segment. */
  avgLogprob: number;
  /** Compression ratio achieved on this segment. */
  compressionRatio: number;
}

/**
 * Structure that represents the result of one transcription call
 * (either `transcribe` or `stream`).
 * @category Types
 */
export interface TranscriptionResult {
  /** The task that produced this result, either 'transcribe' or 'stream'. */
  task?: 'transcribe' | 'stream';
  /** Language chosen for the transcription. */
  language: string;
  /** Duration in seconds of the transcribed audio. */
  duration: number;
  /** The whole text of the transcription. */
  text: string;
  /**
   * Array of `TranscriptionSegment` with details split into separate
   * transcription segments; present only when `verbose` is set to `true`
   * in `DecodingOptions`.
   */
  segments?: TranscriptionSegment[];
}

/**
 * Configuration describing where a Speech to Text model and its tokenizer
 * are loaded from.
 * @category Types
 */
export interface SpeechToTextModelConfig {
  /**
   * The built-in model name (e.g. `'whisper-tiny-en'`). Used for telemetry and hook reload triggers.
   * Pass one of the pre-built STT constants (e.g. `WHISPER_TINY_EN`) to populate all required fields.
   */
  modelName: SpeechToTextModelName;

  /**
   * A boolean flag indicating whether the model supports multiple languages
   * (the English-only `-en` variants do not).
   */
  isMultilingual: boolean;

  /**
   * A string that specifies the location of a `.pte` file for the model.
   *
   * We expect the model to have 2 bundled methods: 'decode' and 'encode'.
   */
  modelSource: ResourceSource;

  /**
   * A string that specifies the location of the tokenizer for the model.
   */
  tokenizerSource: ResourceSource;
}
