// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

import { APIResource } from '../../core/resource';
import * as ClientSecretsAPI from './client-secrets';
import * as RealtimeAPI from './realtime';
import * as ResponsesAPI from '../responses/responses';
import { APIPromise } from '../../core/api-promise';
import { RequestOptions } from '../../internal/request-options';

export class ClientSecrets extends APIResource {
  /**
   * Create a Realtime client secret with an associated session configuration.
   */
  create(body: ClientSecretCreateParams, options?: RequestOptions): APIPromise<ClientSecretCreateResponse> {
    return this._client.post('/realtime/client_secrets', { body, ...options });
  }
}

/**
 * Ephemeral key returned by the API.
 */
export interface RealtimeSessionClientSecret {
  /**
   * Timestamp for when the token expires. Currently, all tokens expire after one
   * minute.
   */
  expires_at: number;

  /**
   * Ephemeral key usable in client environments to authenticate connections to the
   * Realtime API. Use this in client-side environments rather than a standard API
   * token, which should only be used server-side.
   */
  value: string;
}

/**
 * A new Realtime session configuration, with an ephemeral key. Default TTL for
 * keys is one minute.
 */
export interface RealtimeSessionCreateResponse {
  /**
   * Ephemeral key returned by the API.
   */
  client_secret: RealtimeSessionClientSecret;

  /**
   * The type of session to create. Always `realtime` for the Realtime API.
   */
  type: 'realtime';

  /**
   * Configuration for input and output audio.
   */
  audio?: RealtimeSessionCreateResponse.Audio;

  /**
   * Additional fields to include in server outputs.
   *
   * `item.input_audio_transcription.logprobs`: Include logprobs for input audio
   * transcription.
   */
  include?: Array<'item.input_audio_transcription.logprobs'>;

  /**
   * The default system instructions (i.e. system message) prepended to model calls.
   * This field allows the client to guide the model on desired responses. The model
   * can be instructed on response content and format, (e.g. "be extremely succinct",
   * "act friendly", "here are examples of good responses") and on audio behavior
   * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
   * instructions are not guaranteed to be followed by the model, but they provide
   * guidance to the model on the desired behavior.
   *
   * Note that the server sets default instructions which will be used if this field
   * is not set and are visible in the `session.created` event at the start of the
   * session.
   */
  instructions?: string;

  /**
   * Maximum number of output tokens for a single assistant response, inclusive of
   * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
   * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
   */
  max_output_tokens?: number | 'inf';

  /**
   * The Realtime model used for this session.
   */
  model?:
    | (string & {})
    | 'gpt-realtime'
    | 'gpt-realtime-2025-08-28'
    | 'gpt-4o-realtime-preview'
    | 'gpt-4o-realtime-preview-2024-10-01'
    | 'gpt-4o-realtime-preview-2024-12-17'
    | 'gpt-4o-realtime-preview-2025-06-03'
    | 'gpt-4o-mini-realtime-preview'
    | 'gpt-4o-mini-realtime-preview-2024-12-17';

  /**
   * The set of modalities the model can respond with. It defaults to `["audio"]`,
   * indicating that the model will respond with audio plus a transcript. `["text"]`
   * can be used to make the model respond with text only. It is not possible to
   * request both `text` and `audio` at the same time.
   */
  output_modalities?: Array<'text' | 'audio'>;

  /**
   * Reference to a prompt template and its variables.
   * [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
   */
  prompt?: ResponsesAPI.ResponsePrompt | null;

  /**
   * How the model chooses tools. Provide one of the string modes or force a specific
   * function/MCP tool.
   */
  tool_choice?: ResponsesAPI.ToolChoiceOptions | ResponsesAPI.ToolChoiceFunction | ResponsesAPI.ToolChoiceMcp;

  /**
   * Tools available to the model.
   */
  tools?: Array<RealtimeAPI.RealtimeFunctionTool | RealtimeSessionCreateResponse.McpTool>;

  /**
   * Realtime API can write session traces to the
   * [Traces Dashboard](/logs?api=traces). Set to null to disable tracing. Once
   * tracing is enabled for a session, the configuration cannot be modified.
   *
   * `auto` will create a trace for the session with default values for the workflow
   * name, group id, and metadata.
   */
  tracing?: 'auto' | RealtimeSessionCreateResponse.TracingConfiguration | null;

  /**
   * Controls how the realtime conversation is truncated prior to model inference.
   * The default is `auto`.
   */
  truncation?: RealtimeAPI.RealtimeTruncation;
}

export namespace RealtimeSessionCreateResponse {
  /**
   * Configuration for input and output audio.
   */
  export interface Audio {
    input?: Audio.Input;

    output?: Audio.Output;
  }

  export namespace Audio {
    export interface Input {
      /**
       * The format of the input audio.
       */
      format?: RealtimeAPI.RealtimeAudioFormats;

      /**
       * Configuration for input audio noise reduction. This can be set to `null` to turn
       * off. Noise reduction filters audio added to the input audio buffer before it is
       * sent to VAD and the model. Filtering the audio can improve VAD and turn
       * detection accuracy (reducing false positives) and model performance by improving
       * perception of the input audio.
       */
      noise_reduction?: Input.NoiseReduction;

      /**
       * Configuration for input audio transcription, defaults to off and can be set to
       * `null` to turn off once on. Input audio transcription is not native to the
       * model, since the model consumes audio directly. Transcription runs
       * asynchronously through
       * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
       * and should be treated as guidance of input audio content rather than precisely
       * what the model heard. The client can optionally set the language and prompt for
       * transcription, these offer additional guidance to the transcription service.
       */
      transcription?: RealtimeAPI.AudioTranscription;

      /**
       * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
       * set to `null` to turn off, in which case the client must manually trigger model
       * response. Server VAD means that the model will detect the start and end of
       * speech based on audio volume and respond at the end of user speech. Semantic VAD
       * is more advanced and uses a turn detection model (in conjunction with VAD) to
       * semantically estimate whether the user has finished speaking, then dynamically
       * sets a timeout based on this probability. For example, if user audio trails off
       * with "uhhm", the model will score a low probability of turn end and wait longer
       * for the user to continue speaking. This can be useful for more natural
       * conversations, but may have a higher latency.
       */
      turn_detection?: Input.TurnDetection;
    }

    export namespace Input {
      /**
       * Configuration for input audio noise reduction. This can be set to `null` to turn
       * off. Noise reduction filters audio added to the input audio buffer before it is
       * sent to VAD and the model. Filtering the audio can improve VAD and turn
       * detection accuracy (reducing false positives) and model performance by improving
       * perception of the input audio.
       */
      export interface NoiseReduction {
        /**
         * Type of noise reduction. `near_field` is for close-talking microphones such as
         * headphones, `far_field` is for far-field microphones such as laptop or
         * conference room microphones.
         */
        type?: RealtimeAPI.NoiseReductionType;
      }

      /**
       * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
       * set to `null` to turn off, in which case the client must manually trigger model
       * response. Server VAD means that the model will detect the start and end of
       * speech based on audio volume and respond at the end of user speech. Semantic VAD
       * is more advanced and uses a turn detection model (in conjunction with VAD) to
       * semantically estimate whether the user has finished speaking, then dynamically
       * sets a timeout based on this probability. For example, if user audio trails off
       * with "uhhm", the model will score a low probability of turn end and wait longer
       * for the user to continue speaking. This can be useful for more natural
       * conversations, but may have a higher latency.
       */
      export interface TurnDetection {
        /**
         * Whether or not to automatically generate a response when a VAD stop event
         * occurs.
         */
        create_response?: boolean;

        /**
         * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
         * will wait longer for the user to continue speaking, `high` will respond more
         * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
         * and `high` have max timeouts of 8s, 4s, and 2s respectively.
         */
        eagerness?: 'low' | 'medium' | 'high' | 'auto';

        /**
         * Optional idle timeout after which turn detection will auto-timeout when no
         * additional audio is received and emits a `timeout_triggered` event.
         */
        idle_timeout_ms?: number | null;

        /**
         * Whether or not to automatically interrupt any ongoing response with output to
         * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
         * occurs.
         */
        interrupt_response?: boolean;

        /**
         * Used only for `server_vad` mode. Amount of audio to include before the VAD
         * detected speech (in milliseconds). Defaults to 300ms.
         */
        prefix_padding_ms?: number;

        /**
         * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
         * milliseconds). Defaults to 500ms. With shorter values the model will respond
         * more quickly, but may jump in on short pauses from the user.
         */
        silence_duration_ms?: number;

        /**
         * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
         * defaults to 0.5. A higher threshold will require louder audio to activate the
         * model, and thus might perform better in noisy environments.
         */
        threshold?: number;

        /**
         * Type of turn detection.
         */
        type?: 'server_vad' | 'semantic_vad';
      }
    }

    export interface Output {
      /**
       * The format of the output audio.
       */
      format?: RealtimeAPI.RealtimeAudioFormats;

      /**
       * The speed of the model's spoken response as a multiple of the original speed.
       * 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
       * This value can only be changed in between model turns, not while a response is
       * in progress.
       *
       * This parameter is a post-processing adjustment to the audio after it is
       * generated, it's also possible to prompt the model to speak faster or slower.
       */
      speed?: number;

      /**
       * The voice the model uses to respond. Voice cannot be changed during the session
       * once the model has responded with audio at least once. Current voice options are
       * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`,
       * and `cedar`. We recommend `marin` and `cedar` for best quality.
       */
      voice?:
        | (string & {})
        | 'alloy'
        | 'ash'
        | 'ballad'
        | 'coral'
        | 'echo'
        | 'sage'
        | 'shimmer'
        | 'verse'
        | 'marin'
        | 'cedar';
    }
  }

  /**
   * Give the model access to additional tools via remote Model Context Protocol
   * (MCP) servers.
   * [Learn more about MCP](https://platform.openai.com/docs/guides/tools-remote-mcp).
   */
  export interface McpTool {
    /**
     * A label for this MCP server, used to identify it in tool calls.
     */
    server_label: string;

    /**
     * The type of the MCP tool. Always `mcp`.
     */
    type: 'mcp';

    /**
     * List of allowed tool names or a filter object.
     */
    allowed_tools?: Array<string> | McpTool.McpToolFilter | null;

    /**
     * An OAuth access token that can be used with a remote MCP server, either with a
     * custom MCP server URL or a service connector. Your application must handle the
     * OAuth authorization flow and provide the token here.
     */
    authorization?: string;

    /**
     * Identifier for service connectors, like those available in ChatGPT. One of
     * `server_url` or `connector_id` must be provided. Learn more about service
     * connectors
     * [here](https://platform.openai.com/docs/guides/tools-remote-mcp#connectors).
     *
     * Currently supported `connector_id` values are:
     *
     * - Dropbox: `connector_dropbox`
     * - Gmail: `connector_gmail`
     * - Google Calendar: `connector_googlecalendar`
     * - Google Drive: `connector_googledrive`
     * - Microsoft Teams: `connector_microsoftteams`
     * - Outlook Calendar: `connector_outlookcalendar`
     * - Outlook Email: `connector_outlookemail`
     * - SharePoint: `connector_sharepoint`
     */
    connector_id?:
      | 'connector_dropbox'
      | 'connector_gmail'
      | 'connector_googlecalendar'
      | 'connector_googledrive'
      | 'connector_microsoftteams'
      | 'connector_outlookcalendar'
      | 'connector_outlookemail'
      | 'connector_sharepoint';

    /**
     * Optional HTTP headers to send to the MCP server. Use for authentication or other
     * purposes.
     */
    headers?: { [key: string]: string } | null;

    /**
     * Specify which of the MCP server's tools require approval.
     */
    require_approval?: McpTool.McpToolApprovalFilter | 'always' | 'never' | null;

    /**
     * Optional description of the MCP server, used to provide more context.
     */
    server_description?: string;

    /**
     * The URL for the MCP server. One of `server_url` or `connector_id` must be
     * provided.
     */
    server_url?: string;
  }

  export namespace McpTool {
    /**
     * A filter object to specify which tools are allowed.
     */
    export interface McpToolFilter {
      /**
       * Indicates whether or not a tool modifies data or is read-only. If an MCP server
       * is
       * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
       * it will match this filter.
       */
      read_only?: boolean;

      /**
       * List of allowed tool names.
       */
      tool_names?: Array<string>;
    }

    /**
     * Specify which of the MCP server's tools require approval. Can be `always`,
     * `never`, or a filter object associated with tools that require approval.
     */
    export interface McpToolApprovalFilter {
      /**
       * A filter object to specify which tools are allowed.
       */
      always?: McpToolApprovalFilter.Always;

      /**
       * A filter object to specify which tools are allowed.
       */
      never?: McpToolApprovalFilter.Never;
    }

    export namespace McpToolApprovalFilter {
      /**
       * A filter object to specify which tools are allowed.
       */
      export interface Always {
        /**
         * Indicates whether or not a tool modifies data or is read-only. If an MCP server
         * is
         * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
         * it will match this filter.
         */
        read_only?: boolean;

        /**
         * List of allowed tool names.
         */
        tool_names?: Array<string>;
      }

      /**
       * A filter object to specify which tools are allowed.
       */
      export interface Never {
        /**
         * Indicates whether or not a tool modifies data or is read-only. If an MCP server
         * is
         * [annotated with `readOnlyHint`](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations-readonlyhint),
         * it will match this filter.
         */
        read_only?: boolean;

        /**
         * List of allowed tool names.
         */
        tool_names?: Array<string>;
      }
    }
  }

  /**
   * Granular configuration for tracing.
   */
  export interface TracingConfiguration {
    /**
     * The group id to attach to this trace to enable filtering and grouping in the
     * Traces Dashboard.
     */
    group_id?: string;

    /**
     * The arbitrary metadata to attach to this trace to enable filtering in the Traces
     * Dashboard.
     */
    metadata?: unknown;

    /**
     * The name of the workflow to attach to this trace. This is used to name the trace
     * in the Traces Dashboard.
     */
    workflow_name?: string;
  }
}

/**
 * A Realtime transcription session configuration object.
 */
export interface RealtimeTranscriptionSessionCreateResponse {
  /**
   * Unique identifier for the session that looks like `sess_1234567890abcdef`.
   */
  id: string;

  /**
   * The object type. Always `realtime.transcription_session`.
   */
  object: string;

  /**
   * The type of session. Always `transcription` for transcription sessions.
   */
  type: 'transcription';

  /**
   * Configuration for input audio for the session.
   */
  audio?: RealtimeTranscriptionSessionCreateResponse.Audio;

  /**
   * Expiration timestamp for the session, in seconds since epoch.
   */
  expires_at?: number;

  /**
   * Additional fields to include in server outputs.
   *
   * - `item.input_audio_transcription.logprobs`: Include logprobs for input audio
   *   transcription.
   */
  include?: Array<'item.input_audio_transcription.logprobs'>;
}

export namespace RealtimeTranscriptionSessionCreateResponse {
  /**
   * Configuration for input audio for the session.
   */
  export interface Audio {
    input?: Audio.Input;
  }

  export namespace Audio {
    export interface Input {
      /**
       * The PCM audio format. Only a 24kHz sample rate is supported.
       */
      format?: RealtimeAPI.RealtimeAudioFormats;

      /**
       * Configuration for input audio noise reduction.
       */
      noise_reduction?: Input.NoiseReduction;

      /**
       * Configuration of the transcription model.
       */
      transcription?: RealtimeAPI.AudioTranscription;

      /**
       * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
       * means that the model will detect the start and end of speech based on audio
       * volume and respond at the end of user speech.
       */
      turn_detection?: ClientSecretsAPI.RealtimeTranscriptionSessionTurnDetection;
    }

    export namespace Input {
      /**
       * Configuration for input audio noise reduction.
       */
      export interface NoiseReduction {
        /**
         * Type of noise reduction. `near_field` is for close-talking microphones such as
         * headphones, `far_field` is for far-field microphones such as laptop or
         * conference room microphones.
         */
        type?: RealtimeAPI.NoiseReductionType;
      }
    }
  }
}

/**
 * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
 * means that the model will detect the start and end of speech based on audio
 * volume and respond at the end of user speech.
 */
export interface RealtimeTranscriptionSessionTurnDetection {
  /**
   * Amount of audio to include before the VAD detected speech (in milliseconds).
   * Defaults to 300ms.
   */
  prefix_padding_ms?: number;

  /**
   * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
   * With shorter values the model will respond more quickly, but may jump in on
   * short pauses from the user.
   */
  silence_duration_ms?: number;

  /**
   * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
   * threshold will require louder audio to activate the model, and thus might
   * perform better in noisy environments.
   */
  threshold?: number;

  /**
   * Type of turn detection, only `server_vad` is currently supported.
   */
  type?: string;
}

/**
 * Response from creating a session and client secret for the Realtime API.
 */
export interface ClientSecretCreateResponse {
  /**
   * Expiration timestamp for the client secret, in seconds since epoch.
   */
  expires_at: number;

  /**
   * The session configuration for either a realtime or transcription session.
   */
  session: RealtimeSessionCreateResponse | RealtimeTranscriptionSessionCreateResponse;

  /**
   * The generated client secret value.
   */
  value: string;
}

export interface ClientSecretCreateParams {
  /**
   * Configuration for the client secret expiration. Expiration refers to the time
   * after which a client secret will no longer be valid for creating sessions. The
   * session itself may continue after that time once started. A secret can be used
   * to create multiple sessions until it expires.
   */
  expires_after?: ClientSecretCreateParams.ExpiresAfter;

  /**
   * Session configuration to use for the client secret. Choose either a realtime
   * session or a transcription session.
   */
  session?: RealtimeAPI.RealtimeSessionCreateRequest | RealtimeAPI.RealtimeTranscriptionSessionCreateRequest;
}

export namespace ClientSecretCreateParams {
  /**
   * Configuration for the client secret expiration. Expiration refers to the time
   * after which a client secret will no longer be valid for creating sessions. The
   * session itself may continue after that time once started. A secret can be used
   * to create multiple sessions until it expires.
   */
  export interface ExpiresAfter {
    /**
     * The anchor point for the client secret expiration, meaning that `seconds` will
     * be added to the `created_at` time of the client secret to produce an expiration
     * timestamp. Only `created_at` is currently supported.
     */
    anchor?: 'created_at';

    /**
     * The number of seconds from the anchor point to the expiration. Select a value
     * between `10` and `7200` (2 hours). This default to 600 seconds (10 minutes) if
     * not specified.
     */
    seconds?: number;
  }
}

export declare namespace ClientSecrets {
  export {
    type RealtimeSessionClientSecret as RealtimeSessionClientSecret,
    type RealtimeSessionCreateResponse as RealtimeSessionCreateResponse,
    type RealtimeTranscriptionSessionCreateResponse as RealtimeTranscriptionSessionCreateResponse,
    type RealtimeTranscriptionSessionTurnDetection as RealtimeTranscriptionSessionTurnDetection,
    type ClientSecretCreateResponse as ClientSecretCreateResponse,
    type ClientSecretCreateParams as ClientSecretCreateParams,
  };
}
