{"version":3,"sources":["../../src/indexer/index.ts"],"sourcesContent":["/**\n * Copyright 2024 Bloom Labs Inc\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport { GoogleGenerativeAIEmbeddings } from '@langchain/google-genai';\nimport { glob } from 'glob';\nimport fs from 'fs';\nimport { CharacterTextSplitter } from 'langchain/text_splitter';\nimport { HNSWLib } from 'langchain/vectorstores';\nimport { TaskType } from '@google/generative-ai';\n\nimport { IndexerFlowOptions, PluginOptions } from '../interfaces';\nimport {\n  EMBEDDING_MODEL_NAME,\n  EMBEDDING_MODEL,\n  EMBEDDING_TITLE,\n} from '../constants';\n\nconst getFilesData = (files: string[]): string[] => {\n  console.log(\n    `Added ${files.length} files to data.  Splitting text into chunks...`\n  );\n  const filesData: string[] = [];\n  for (const file of files) {\n    filesData.push(fs.readFileSync(file, 'utf-8'));\n  }\n  return filesData;\n};\n\nconst getFiles = async (input: string): Promise<string[]> => {\n  try {\n    return glob(input, { ignore: 'node_modules/**' });\n  } catch (error) {\n    console.error('Error fetching files:', error);\n    throw error;\n  }\n};\n\nconst getSplitter = (\n  chunkSize: number | undefined,\n  separator: string | undefined\n) => {\n  return new CharacterTextSplitter({\n    chunkSize: chunkSize || 12720,\n    separator: separator || '\\n',\n  });\n};\n\nconst saveVectorStore = async (\n  docs: string[],\n  apiKey: string | undefined,\n  output: string\n) => {\n  console.log('Initializing Store...');\n  const store = await HNSWLib.fromTexts(\n    docs,\n    docs.map((_: any, i: any) => ({ id: i })),\n    new GoogleGenerativeAIEmbeddings({\n      apiKey: apiKey || process.env.GOOGLE_API_KEY,\n      model: EMBEDDING_MODEL,\n      modelName: EMBEDDING_MODEL_NAME,\n      taskType: TaskType.RETRIEVAL_DOCUMENT,\n      title: EMBEDDING_TITLE,\n    })\n  );\n  console.log('Saving Vectorstore');\n  await store.save(output);\n  return `VectorStore saved to ${output}`;\n};\n\nconst getVectorDocument = (\n  filesData: string[],\n  textSplitter: { splitText: (arg0: any) => any }\n) => {\n  let docs: string[] = [];\n  for (const d of filesData) {\n    const docOutput = textSplitter.splitText(d);\n    docs = [...docs, ...docOutput];\n  }\n  return docs.splice(docs.length - 4, 4);\n};\n\nconst saveVectorIndexer = async (\n  flowOptions: IndexerFlowOptions,\n  pluginOptions: PluginOptions\n) => {\n  const { dataPath, indexOutputPath, chunkSize, separator } = flowOptions;\n  const { apiKey } = pluginOptions;\n\n  const files: string[] = await getFiles(dataPath);\n  const filesData = getFilesData(files);\n  const textSplitter = getSplitter(chunkSize, separator);\n  const vectorDocument = getVectorDocument(filesData, textSplitter);\n\n  return saveVectorStore(vectorDocument, apiKey, indexOutputPath);\n};\n\nexport { saveVectorIndexer };\n"],"mappings":"AAgBA,SAAS,oCAAoC;AAC7C,SAAS,YAAY;AACrB,OAAO,QAAQ;AACf,SAAS,6BAA6B;AACtC,SAAS,eAAe;AACxB,SAAS,gBAAgB;AAGzB;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,OACK;AAEP,MAAM,eAAe,CAAC,UAA8B;AAClD,UAAQ;AAAA,IACN,SAAS,MAAM,MAAM;AAAA,EACvB;AACA,QAAM,YAAsB,CAAC;AAC7B,aAAW,QAAQ,OAAO;AACxB,cAAU,KAAK,GAAG,aAAa,MAAM,OAAO,CAAC;AAAA,EAC/C;AACA,SAAO;AACT;AAEA,MAAM,WAAW,OAAO,UAAqC;AAC3D,MAAI;AACF,WAAO,KAAK,OAAO,EAAE,QAAQ,kBAAkB,CAAC;AAAA,EAClD,SAAS,OAAO;AACd,YAAQ,MAAM,yBAAyB,KAAK;AAC5C,UAAM;AAAA,EACR;AACF;AAEA,MAAM,cAAc,CAClB,WACA,cACG;AACH,SAAO,IAAI,sBAAsB;AAAA,IAC/B,WAAW,aAAa;AAAA,IACxB,WAAW,aAAa;AAAA,EAC1B,CAAC;AACH;AAEA,MAAM,kBAAkB,OACtB,MACA,QACA,WACG;AACH,UAAQ,IAAI,uBAAuB;AACnC,QAAM,QAAQ,MAAM,QAAQ;AAAA,IAC1B;AAAA,IACA,KAAK,IAAI,CAAC,GAAQ,OAAY,EAAE,IAAI,EAAE,EAAE;AAAA,IACxC,IAAI,6BAA6B;AAAA,MAC/B,QAAQ,UAAU,QAAQ,IAAI;AAAA,MAC9B,OAAO;AAAA,MACP,WAAW;AAAA,MACX,UAAU,SAAS;AAAA,MACnB,OAAO;AAAA,IACT,CAAC;AAAA,EACH;AACA,UAAQ,IAAI,oBAAoB;AAChC,QAAM,MAAM,KAAK,MAAM;AACvB,SAAO,wBAAwB,MAAM;AACvC;AAEA,MAAM,oBAAoB,CACxB,WACA,iBACG;AACH,MAAI,OAAiB,CAAC;AACtB,aAAW,KAAK,WAAW;AACzB,UAAM,YAAY,aAAa,UAAU,CAAC;AAC1C,WAAO,CAAC,GAAG,MAAM,GAAG,SAAS;AAAA,EAC/B;AACA,SAAO,KAAK,OAAO,KAAK,SAAS,GAAG,CAAC;AACvC;AAEA,MAAM,oBAAoB,OACxB,aACA,kBACG;AACH,QAAM,EAAE,UAAU,iBAAiB,WAAW,UAAU,IAAI;AAC5D,QAAM,EAAE,OAAO,IAAI;AAEnB,QAAM,QAAkB,MAAM,SAAS,QAAQ;AAC/C,QAAM,YAAY,aAAa,KAAK;AACpC,QAAM,eAAe,YAAY,WAAW,SAAS;AACrD,QAAM,iBAAiB,kBAAkB,WAAW,YAAY;AAEhE,SAAO,gBAAgB,gBAAgB,QAAQ,eAAe;AAChE;","names":[]}