{"version":3,"file":"youtube.cjs","names":["BaseDocumentLoader","Innertube","Document"],"sources":["../../../src/document_loaders/web/youtube.ts"],"sourcesContent":["import { Innertube } from \"youtubei.js\";\nimport { Document } from \"@langchain/core/documents\";\nimport { BaseDocumentLoader } from \"@langchain/core/document_loaders/base\";\n\n/**\n * Configuration options for the YoutubeLoader class. Includes properties\n * such as the videoId, language, and addVideoInfo.\n */\ninterface YoutubeConfig {\n  videoId: string;\n  language?: string;\n  addVideoInfo?: boolean;\n}\n\n/**\n * Metadata of a YouTube video. Includes properties such as the source\n * (videoId), description, title, view_count, author, and category.\n */\ninterface VideoMetadata {\n  source: string;\n  description?: string;\n  title?: string;\n  view_count?: number;\n  author?: string;\n  category?: string;\n}\n\n/**\n * A document loader for loading data from YouTube videos. It uses the\n * youtubei.js library to fetch the transcript and video metadata.\n * @example\n * ```typescript\n * const loader = new YoutubeLoader(\n *   \"https:\n *   \"en\",\n *   true,\n * );\n * const docs = await loader.load();\n * ```\n */\nexport class YoutubeLoader extends BaseDocumentLoader {\n  private videoId: string;\n\n  private language?: string;\n\n  private addVideoInfo: boolean;\n\n  constructor(config: YoutubeConfig) {\n    super();\n    this.videoId = config.videoId;\n    this.language = config?.language;\n    this.addVideoInfo = config?.addVideoInfo ?? false;\n  }\n\n  /**\n   * Extracts the videoId from a YouTube video URL.\n   * @param url The URL of the YouTube video.\n   * @returns The videoId of the YouTube video.\n   */\n  private static getVideoID(url: string): string {\n    // YouTube video IDs are exactly 11 characters: alphanumeric, underscores, and hyphens\n    // Using a bounded pattern to avoid ReDoS vulnerabilities\n    const match = url.match(\n      /(?:youtu\\.be\\/|youtube\\.com\\/(?:v\\/|u\\/\\w\\/|embed\\/|watch\\?v=|shorts\\/))([a-zA-Z0-9_-]{11})(?:[?&#]|$)/\n    );\n    if (match !== null) {\n      return match[1];\n    } else {\n      throw new Error(\"Failed to get youtube video id from the url\");\n    }\n  }\n\n  /**\n   * Creates a new instance of the YoutubeLoader class from a YouTube video\n   * URL.\n   * @param url The URL of the YouTube video.\n   * @param config Optional configuration options for the YoutubeLoader instance, excluding the videoId.\n   * @returns A new instance of the YoutubeLoader class.\n   */\n  static createFromUrl(\n    url: string,\n    config?: Omit<YoutubeConfig, \"videoId\">\n  ): YoutubeLoader {\n    const videoId = YoutubeLoader.getVideoID(url);\n    return new YoutubeLoader({ ...config, videoId });\n  }\n\n  /**\n   * Loads the transcript and video metadata from the specified YouTube\n   * video. It uses the youtubei.js library to fetch the video metadata and transcripts.\n   * @returns An array of Documents representing the retrieved data.\n   */\n  async load(): Promise<Document[]> {\n    let transcript: string | undefined;\n    const metadata: VideoMetadata = {\n      source: this.videoId,\n    };\n    try {\n      const youtube = await Innertube.create({\n        lang: this.language,\n        retrieve_player: false,\n      });\n      const info = await youtube.getInfo(this.videoId);\n      const transcriptData = await info.getTranscript();\n      transcript =\n        transcriptData.transcript.content?.body?.initial_segments\n          .map((segment) => segment.snippet.text)\n          .join(\" \") ?? \"\";\n      if (transcript === undefined) {\n        throw new Error(\"Transcription not found\");\n      }\n      if (this.addVideoInfo) {\n        const basicInfo = info.basic_info;\n        metadata.description = basicInfo.short_description;\n        metadata.title = basicInfo.title;\n        metadata.view_count = basicInfo.view_count;\n        metadata.author = basicInfo.author;\n      }\n    } catch (e: unknown) {\n      throw new Error(\n        `Failed to get YouTube video transcription: ${(e as Error).message}`\n      );\n    }\n    const document = new Document({\n      pageContent: transcript,\n      metadata,\n    });\n\n    return [document];\n  }\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAwCA,IAAa,gBAAb,MAAa,sBAAsBA,sCAAAA,mBAAmB;CACpD;CAEA;CAEA;CAEA,YAAY,QAAuB;AACjC,SAAO;AACP,OAAK,UAAU,OAAO;AACtB,OAAK,WAAW,QAAQ;AACxB,OAAK,eAAe,QAAQ,gBAAgB;;;;;;;CAQ9C,OAAe,WAAW,KAAqB;EAG7C,MAAM,QAAQ,IAAI,MAChB,yGACD;AACD,MAAI,UAAU,KACZ,QAAO,MAAM;MAEb,OAAM,IAAI,MAAM,8CAA8C;;;;;;;;;CAWlE,OAAO,cACL,KACA,QACe;EACf,MAAM,UAAU,cAAc,WAAW,IAAI;AAC7C,SAAO,IAAI,cAAc;GAAE,GAAG;GAAQ;GAAS,CAAC;;;;;;;CAQlD,MAAM,OAA4B;EAChC,IAAI;EACJ,MAAM,WAA0B,EAC9B,QAAQ,KAAK,SACd;AACD,MAAI;GAKF,MAAM,OAAO,OAJG,MAAMC,YAAAA,UAAU,OAAO;IACrC,MAAM,KAAK;IACX,iBAAiB;IAClB,CAAC,EACyB,QAAQ,KAAK,QAAQ;AAEhD,iBADuB,MAAM,KAAK,eAAe,EAEhC,WAAW,SAAS,MAAM,iBACtC,KAAK,YAAY,QAAQ,QAAQ,KAAK,CACtC,KAAK,IAAI,IAAI;AAClB,OAAI,eAAe,KAAA,EACjB,OAAM,IAAI,MAAM,0BAA0B;AAE5C,OAAI,KAAK,cAAc;IACrB,MAAM,YAAY,KAAK;AACvB,aAAS,cAAc,UAAU;AACjC,aAAS,QAAQ,UAAU;AAC3B,aAAS,aAAa,UAAU;AAChC,aAAS,SAAS,UAAU;;WAEvB,GAAY;AACnB,SAAM,IAAI,MACR,8CAA+C,EAAY,UAC5D;;AAOH,SAAO,CALU,IAAIC,0BAAAA,SAAS;GAC5B,aAAa;GACb;GACD,CAAC,CAEe"}