{"version":3,"sources":["../src/Trie.ts","../src/tokenizer.ts"],"sourcesContent":["/**\r\n * Trie in TypeScript. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass\r\n * Loose reference https://en.wikipedia.org/wiki/Trie\r\n */\r\nexport class Trie {\r\n  private data: Record<string, any>;\r\n  private _tokens: Set<string>;\r\n\r\n  constructor() {\r\n    this.data = {};\r\n    this._tokens = new Set();\r\n  }\r\n\r\n  /**\r\n   * Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation.\r\n   * The special key `\"\"` is used to represent termination.\r\n   *\r\n   * This function is idempotent, adding twice the same word will leave the trie unchanged\r\n   *\r\n   * Example:\r\n   *\r\n   * ```typescript\r\n   * const trie = new Trie();\r\n   * trie.add(\"Hello 友達\");\r\n   * console.log(trie.data);\r\n   * // {\"H\": {\"e\": {\"l\": {\"l\": {\"o\": {\" \": {\"友\": {\"達\": {\"\": 1}}}}}}}}}\r\n   *\r\n   * trie.add(\"Hello\");\r\n   * console.log(trie.data);\r\n   * // {\"H\": {\"e\": {\"l\": {\"l\": {\"o\": {\"\": 1, \" \": {\"友\": {\"達\": {\"\": 1}}}}}}}}}\r\n   * ```\r\n   */\r\n  add(word: string): void {\r\n    if (!word) {\r\n      // Prevent empty string\r\n      return;\r\n    }\r\n\r\n    this._tokens.add(word);\r\n    let ref = this.data;\r\n    for (const char of word) {\r\n      ref[char] = char in ref ? ref[char] : {};\r\n      ref = ref[char];\r\n    }\r\n    ref[\"\"] = 1;\r\n  }\r\n\r\n  /**\r\n   * Will look for the words added to the trie within `text`. Output is the original string splitted along the\r\n   * boundaries of the words found.\r\n   *\r\n   * This trie will match the longest possible word first !\r\n   *\r\n   * Example:\r\n   *\r\n   * ```typescript\r\n   * const trie = new Trie();\r\n   * console.log(trie.split(\"[CLS] This is a extra_id_100\"));\r\n   * // [\"[CLS] This is a extra_id_100\"]\r\n   *\r\n   * trie.add(\"[CLS]\");\r\n   * trie.add(\"extra_id_1\");\r\n   * trie.add(\"extra_id_100\");\r\n   * console.log(trie.split(\"[CLS] This is a extra_id_100\"));\r\n   * // [\"[CLS]\", \" This is a \", \"extra_id_100\"]\r\n   * ```\r\n   */\r\n  split(text: string): string[] {\r\n    if (!text) {\r\n      return [];\r\n    }\r\n    let states: Record<number, any> = {};\r\n\r\n    // indexes are counted left of the chars index.\r\n    // \"hello\", index 0, is left of h, index 1 is between h and e.\r\n    // index 5 is right of the \"o\".\r\n\r\n    // States are going to capture every possible start (indexes as above)\r\n    // as keys, and have as values, a pointer to the position in the trie\r\n    // where we're at. This is a partial match for now.\r\n    // This enables to keep track of multiple matches while we're iterating\r\n    // the string\r\n    // If the trie contains, \"blowing\", and \"lower\" and we encounter the\r\n    // string \"blower\", we need to split into [\"b\", \"lower\"].\r\n    // This is where we need to keep track of multiple possible starts.\r\n    const offsets: number[] = [0];\r\n\r\n    // This is used by the lookahead which needs to skip over\r\n    // some text where the full match exceeded the place in the initial\r\n    // for loop\r\n    let skip = 0;\r\n\r\n    // Main loop, Giving this algorithm O(n) complexity\r\n    for (let current = 0; current < text.length; current++) {\r\n      if (skip && current < skip) {\r\n        // Prevents the lookahead for matching twice\r\n        // like extra_id_100 and id_100\r\n        continue;\r\n      }\r\n\r\n      // This will track every state\r\n      // that stop matching, we need to stop tracking them.\r\n      // If we look at \"lowball\", we're going to match \"l\" (add it to states), \"o\", \"w\", then\r\n      // fail on \"b\", we need to remove 0 from the valid states.\r\n      let toRemove: Set<number> = new Set();\r\n      // Whenever we found a match, we need to drop everything\r\n      // this is a greedy algorithm, it will match on the first found token\r\n      let reset = false;\r\n\r\n      // In this case, we already have partial matches (But unfinished)\r\n      for (let start in states) {\r\n        let triePointer = states[start];\r\n        if (\"\" in triePointer) {\r\n          // This is a final match, we need to reset and\r\n          // store the results in `offsets`.\r\n\r\n          // Lookahead to match longest first\r\n          // Important in case of extra_id_1 vs extra_id_100\r\n          // Here we are also actively looking for other earlier partial\r\n          // matches\r\n          // \"[CLS]\", \"L\", we need to match CLS even if L is special\r\n          let lookaheadIndex: number;\r\n          let end: number;\r\n          let nextChar: string | null;\r\n\r\n          for (const lookStart in states) {\r\n            let lookTriePointer = states[lookStart];\r\n            if (parseInt(lookStart) > parseInt(start)) {\r\n              // This partial match is later, we can stop looking\r\n              break;\r\n            } else if (parseInt(lookStart) < parseInt(start)) {\r\n              // This partial match is earlier, the trie pointer\r\n              // was already updated, so index is + 1\r\n              lookaheadIndex = current + 1;\r\n              end = current + 1;\r\n            } else {\r\n              // Here lookstart == start and\r\n              //      looktrie_pointer == trie_pointer\r\n              // It wasn't updated yet so indices are current ones\r\n              lookaheadIndex = current;\r\n              end = current;\r\n            }\r\n            nextChar =\r\n              lookaheadIndex < text.length ? text[lookaheadIndex] : null;\r\n            if (\"\" in lookTriePointer) {\r\n              start = lookStart;\r\n              end = lookaheadIndex;\r\n              skip = lookaheadIndex;\r\n            }\r\n            while (nextChar && nextChar in lookTriePointer) {\r\n              lookTriePointer = lookTriePointer[nextChar];\r\n              lookaheadIndex += 1;\r\n              if (\"\" in lookTriePointer) {\r\n                start = lookStart;\r\n                end = lookaheadIndex;\r\n                skip = lookaheadIndex;\r\n              }\r\n\r\n              if (lookaheadIndex === text.length) {\r\n                // End of string\r\n                break;\r\n              }\r\n              nextChar = text[lookaheadIndex];\r\n            }\r\n            // End lookahead\r\n          }\r\n\r\n          // Storing and resetting\r\n          offsets.push(parseInt(start));\r\n          offsets.push(end!);\r\n          reset = true;\r\n          break;\r\n        } else if (text[current] in triePointer) {\r\n          // The current character being looked at has a match within the trie\r\n          // update the pointer (it will be stored back into states later).\r\n          triePointer = triePointer[text[current]];\r\n\r\n          // Storing back the new pointer into the states.\r\n          // Partial matches got longer by one.\r\n          states[start] = triePointer;\r\n        } else {\r\n          // The new character has not match in the trie, we need\r\n          // to stop keeping track of this partial match.\r\n          // We can't do it directly within the loop because of how\r\n          // TypeScript iteration works\r\n          toRemove.add(parseInt(start));\r\n        }\r\n      }\r\n\r\n      // Either clearing the full start (we found a real match)\r\n      // Or clearing only the partial matches that didn't work.\r\n      if (reset) {\r\n        states = {};\r\n      } else {\r\n        for (const start of toRemove) {\r\n          delete states[start];\r\n        }\r\n      }\r\n\r\n      // If this character is a starting character within the trie\r\n      // start keeping track of this partial match.\r\n      if (current >= skip && text[current] in this.data) {\r\n        states[current] = this.data[text[current]];\r\n      }\r\n    }\r\n\r\n    // We have a cut at the end with states.\r\n    for (const start in states) {\r\n      const triePointer = states[start];\r\n      if (\"\" in triePointer) {\r\n        // This is a final match, we need to reset and\r\n        // store the results in `offsets`.\r\n        const end = text.length;\r\n        offsets.push(parseInt(start));\r\n        offsets.push(end);\r\n        // Longest cut is always the one with lower start so the first\r\n        // item so we need to break.\r\n        break;\r\n      }\r\n    }\r\n\r\n    return this.cutText(text, offsets);\r\n  }\r\n\r\n  protected cutText(text: string, offsets: number[]): string[] {\r\n    // We have all the offsets now, we just need to do the actual splitting.\r\n    // We need to eventually add the first part of the string and the eventual\r\n    // last part.\r\n    offsets.push(text.length);\r\n    const tokens: string[] = [];\r\n    let start = 0;\r\n    for (const end of offsets) {\r\n      if (start > end) {\r\n        console.error(\r\n          \"There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it anyway.\"\r\n        );\r\n        continue;\r\n      } else if (start === end) {\r\n        // This might happen if there's a match at index 0\r\n        // we're also preventing zero-width cuts in case of two\r\n        // consecutive matches\r\n        continue;\r\n      }\r\n      tokens.push(text.slice(start, end));\r\n      start = end;\r\n    }\r\n\r\n    return tokens;\r\n  }\r\n}\r\n\r\n// test case\r\n// const main = () => {\r\n//   const trie = new Trie();\r\n//   console.log(trie.split(\"[CLS] This is a extra_id_100\"));\r\n//   // [\"[CLS] This is a extra_id_100\"]\r\n\r\n//   trie.add(\"[CLS]\");\r\n//   trie.add(\"extra_id_1\");\r\n//   trie.add(\"extra_id_100\");\r\n//   console.log(trie.split(\"[CLS] This is a extra_id_100\"));\r\n//   // [\"[CLS]\", \" This is a \", \"extra_id_100\"]\r\n// };\r\n\r\n// main();\r\n","import { Trie } from \"./Trie\";\r\n\r\nconst utf8Encoder = new TextEncoder();\r\nconst utf8Decoder = new TextDecoder(\"utf-8\");\r\n\r\nconst byteToHex = (byte: number) =>\r\n  byte.toString(16).padStart(2, \"0\").toUpperCase();\r\n\r\nexport class Llama2Tokenizer {\r\n  protected tokens_trie = new Trie();\r\n  protected special_tokens: Record<string, number> = {};\r\n\r\n  protected vocab: Record<string, number> = {};\r\n  protected vocab_ids: Record<number, string> = {};\r\n\r\n  constructor() {}\r\n\r\n  /**\r\n   * Install the provided vocabulary into the class instance.\r\n   *\r\n   * @param {Record<string, number>} vocab - The vocabulary to be installed\r\n   */\r\n  install_vocab(vocab: Record<string, number>) {\r\n    this.vocab = vocab;\r\n    this.vocab_ids = Object.fromEntries(\r\n      Object.entries(vocab).map(([token, id]) => [id, token])\r\n    );\r\n    this.tokens_trie = new Trie();\r\n    for (const [token, id] of Object.entries(vocab)) {\r\n      this.tokens_trie.add(token);\r\n    }\r\n  }\r\n\r\n  /**\r\n   * Get the size of the vocabulary, including special tokens.\r\n   *\r\n   * @return {number} the size of the vocabulary\r\n   */\r\n  get vocab_size(): number {\r\n    return (\r\n      Object.keys(this.vocab).length + Object.keys(this.special_tokens).length\r\n    );\r\n  }\r\n\r\n  /**\r\n   * Get the maximum id from the vocab_ids and special_tokens.\r\n   *\r\n   * @return {number} the maximum id\r\n   */\r\n  get max_id(): number {\r\n    // NOTE: vocab 最大可能超过 js 函数参数个数最大范围，所以不能 `Math.max(...Object.keys(this.vocab_ids))`\r\n    let max_id = 0;\r\n    for (const id of Object.keys(this.vocab_ids)) {\r\n      max_id = Math.max(max_id, parseInt(id));\r\n    }\r\n    for (const id of Object.values(this.special_tokens)) {\r\n      max_id = Math.max(max_id, id);\r\n    }\r\n    return max_id;\r\n  }\r\n\r\n  /**\r\n   * Adds a special token with an optional token ID.\r\n   *\r\n   * @param {string} token - the special token to be added\r\n   * @param {number} [token_id] - the optional token ID\r\n   * @return {void}\r\n   */\r\n  add_special_token(token: string, token_id?: number) {\r\n    if (token_id === undefined) {\r\n      token_id = this.max_id + 1;\r\n    }\r\n    this.special_tokens[token] = token_id;\r\n    this.tokens_trie.add(token);\r\n  }\r\n\r\n  /**\r\n   * Adds special tokens to the list of tokens.\r\n   *\r\n   * @param {Array} tokens - An array of tokens to add. Each token can be a string or an object with `token` and `token_id` properties.\r\n   */\r\n  add_special_tokens(\r\n    tokens: (\r\n      | string\r\n      | {\r\n          token: string;\r\n          token_id: number;\r\n        }\r\n    )[]\r\n  ) {\r\n    for (const token of tokens) {\r\n      if (typeof token === \"string\") {\r\n        this.add_special_token(token);\r\n      } else {\r\n        this.add_special_token(token.token, token.token_id);\r\n      }\r\n    }\r\n  }\r\n\r\n  /**\r\n   * Convert an id to a token.\r\n   *\r\n   * @param {number} id - The id to be converted to a token.\r\n   * @return {string} The corresponding token for the given id.\r\n   */\r\n  ids_to_token(id: number): string {\r\n    const token = this.vocab_ids[id];\r\n    const special_token = Object.entries(this.special_tokens).find(\r\n      ([_, token_id]) => token_id === id\r\n    );\r\n    if (token) {\r\n      return token;\r\n    } else if (special_token) {\r\n      return special_token[0];\r\n    } else {\r\n      throw new Error(`Unknown id: ${id}`);\r\n    }\r\n  }\r\n  /**\r\n   * token_to_id function takes a token as input and returns its corresponding id if found in the vocabulary, otherwise throws an error.\r\n   *\r\n   * @param {string} token - the input token\r\n   * @return {number} the corresponding id of the input token\r\n   */\r\n  token_to_id(token: string): number {\r\n    const id = this.vocab[token];\r\n    const special_token = this.special_tokens[token];\r\n    if (id !== undefined) {\r\n      return id;\r\n    } else if (special_token !== undefined) {\r\n      return special_token;\r\n    } else {\r\n      throw new Error(`Unknown token: ${token}`);\r\n    }\r\n  }\r\n\r\n  /**\r\n   * Retrieve the vocabulary.\r\n   *\r\n   * @return {Object} a shallow copy of the vocabulary\r\n   */\r\n  get_vocab() {\r\n    return { ...this.vocab, ...this.special_tokens };\r\n  }\r\n\r\n  /**\r\n   * Checks if the token is a valid token.\r\n   *\r\n   * @param {string} token - the token to be checked\r\n   * @return {boolean} true if the token is valid, false otherwise\r\n   */\r\n  valid_token(token: string): boolean {\r\n    return token in this.vocab || token in this.special_tokens;\r\n  }\r\n\r\n  /**\r\n   * Converts a string in a sequence of tokens, using the tokenizer.\r\n   */\r\n  tokenize(text: string): string[] {\r\n    const tokens = this.tokens_trie.split(text);\r\n\r\n    const result = [] as string[];\r\n    for (const token of tokens) {\r\n      if (this.valid_token(token)) {\r\n        result.push(token);\r\n      } else {\r\n        // convert unknown unicode to <0xXX>\r\n        // TODO: use a better way to handle unknown unicode (某些vocab不支持unknown unicode可能需要<unk>代替)\r\n        const bytes = utf8Encoder.encode(token);\r\n        for (const byte of bytes) {\r\n          result.push(`<0x${byteToHex(byte)}>`);\r\n        }\r\n      }\r\n    }\r\n\r\n    return result;\r\n  }\r\n\r\n  /**\r\n   * Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.\r\n   */\r\n  encode(text: string): number[] {\r\n    return this.convert_tokens_to_ids(this.tokenize(text));\r\n  }\r\n\r\n  /**\r\n   * Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and added tokens.\r\n   */\r\n  decode(ids: number[]): string {\r\n    return this.convert_tokens_to_string(this.convert_ids_to_tokens(ids));\r\n  }\r\n\r\n  /**\r\n   * Converts a sequence of tokens (string) in a single string.\r\n   */\r\n  convert_tokens_to_string(tokens: string[]): string {\r\n    for (const token of tokens) {\r\n      if (!this.valid_token(token)) {\r\n        throw new Error(`Unknown token: ${token}`);\r\n      }\r\n    }\r\n    const chars = [] as string[];\r\n\r\n    let index = 0;\r\n    while (index < tokens.length) {\r\n      let token = tokens[index];\r\n      index += 1;\r\n      if (!token.startsWith(\"<0x\")) {\r\n        chars.push(token);\r\n        continue;\r\n      }\r\n      const bytes = [] as number[];\r\n      while (token && token.startsWith(\"<0x\")) {\r\n        bytes.push(parseInt(token.slice(3, 5), 16));\r\n        token = tokens[index];\r\n        index += 1;\r\n      }\r\n      chars.push(utf8Decoder.decode(new Uint8Array(bytes)));\r\n    }\r\n\r\n    return chars.join(\"\");\r\n  }\r\n\r\n  /**\r\n   * Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the vocabulary.\r\n   */\r\n  convert_tokens_to_ids(tokens: string[]): number[] {\r\n    let result: number[] = [];\r\n    for (const token of tokens) {\r\n      const id = this.token_to_id(token);\r\n      result.push(id);\r\n    }\r\n    return result;\r\n  }\r\n\r\n  /**\r\n   * Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and added tokens.\r\n   */\r\n  convert_ids_to_tokens(ids: number[]): string[] {\r\n    return ids.map((id) => {\r\n      const token = this.ids_to_token(id);\r\n      return token;\r\n    });\r\n  }\r\n}\r\n\r\n// test\r\n// const main = async () => {\r\n//   const tokenizer = new Llama2Tokenizer();\r\n//   tokenizer.load_llama2_vocab();\r\n//   console.log(tokenizer.tokenize(\"你好，世界！\"));\r\n//   // [\"你\", \"好\", \"，\", \"世\", \"界\", \"！\"]\r\n// };\r\n// main();\r\n"],"mappings":"+EAIO,IAAMA,EAAN,KAAW,CAJlB,MAIkB,CAAAC,EAAA,aACR,KACA,QAER,aAAc,CACZ,KAAK,KAAO,CAAC,EACb,KAAK,QAAU,IAAI,GACrB,CAqBA,IAAIC,EAAoB,CACtB,GAAI,CAACA,EAEH,OAGF,KAAK,QAAQ,IAAIA,CAAI,EACrB,IAAIC,EAAM,KAAK,KACf,QAAWC,KAAQF,EACjBC,EAAIC,CAAI,EAAIA,KAAQD,EAAMA,EAAIC,CAAI,EAAI,CAAC,EACvCD,EAAMA,EAAIC,CAAI,EAEhBD,EAAI,EAAE,EAAI,CACZ,CAsBA,MAAME,EAAwB,CAC5B,GAAI,CAACA,EACH,MAAO,CAAC,EAEV,IAAIC,EAA8B,CAAC,EAc7BC,EAAoB,CAAC,CAAC,EAKxBC,EAAO,EAGX,QAASC,EAAU,EAAGA,EAAUJ,EAAK,OAAQI,IAAW,CACtD,GAAID,GAAQC,EAAUD,EAGpB,SAOF,IAAIE,EAAwB,IAAI,IAG5BC,EAAQ,GAGZ,QAASC,KAASN,EAAQ,CACxB,IAAIO,EAAcP,EAAOM,CAAK,EAC9B,GAAI,KAAMC,EAAa,CASrB,IAAIC,EACAC,EACAC,EAEJ,QAAWC,KAAaX,EAAQ,CAC9B,IAAIY,EAAkBZ,EAAOW,CAAS,EACtC,GAAI,SAASA,CAAS,EAAI,SAASL,CAAK,EAEtC,MAoBF,IAnBW,SAASK,CAAS,EAAI,SAASL,CAAK,GAG7CE,EAAiBL,EAAU,EAC3BM,EAAMN,EAAU,IAKhBK,EAAiBL,EACjBM,EAAMN,GAERO,EACEF,EAAiBT,EAAK,OAASA,EAAKS,CAAc,EAAI,MACpD,KAAMI,KACRN,EAAQK,EACRF,EAAMD,EACNN,EAAOM,GAEFE,GAAYA,KAAYE,IAC7BA,EAAkBA,EAAgBF,CAAQ,EAC1CF,GAAkB,EACd,KAAMI,IACRN,EAAQK,EACRF,EAAMD,EACNN,EAAOM,GAGLA,IAAmBT,EAAK,SAI5BW,EAAWX,EAAKS,CAAc,CAGlC,CAGAP,EAAQ,KAAK,SAASK,CAAK,CAAC,EAC5BL,EAAQ,KAAKQ,CAAI,EACjBJ,EAAQ,GACR,KACF,MAAWN,EAAKI,CAAO,IAAKI,GAG1BA,EAAcA,EAAYR,EAAKI,CAAO,CAAC,EAIvCH,EAAOM,CAAK,EAAIC,GAMhBH,EAAS,IAAI,SAASE,CAAK,CAAC,CAEhC,CAIA,GAAID,EACFL,EAAS,CAAC,MAEV,SAAWM,KAASF,EAClB,OAAOJ,EAAOM,CAAK,EAMnBH,GAAWD,GAAQH,EAAKI,CAAO,IAAK,KAAK,OAC3CH,EAAOG,CAAO,EAAI,KAAK,KAAKJ,EAAKI,CAAO,CAAC,EAE7C,CAGA,QAAWG,KAASN,EAElB,GAAI,KADgBA,EAAOM,CAAK,EACT,CAGrB,IAAMG,EAAMV,EAAK,OACjBE,EAAQ,KAAK,SAASK,CAAK,CAAC,EAC5BL,EAAQ,KAAKQ,CAAG,EAGhB,KACF,CAGF,OAAO,KAAK,QAAQV,EAAME,CAAO,CACnC,CAEU,QAAQF,EAAcE,EAA6B,CAI3DA,EAAQ,KAAKF,EAAK,MAAM,EACxB,IAAMc,EAAmB,CAAC,EACtBP,EAAQ,EACZ,QAAWG,KAAOR,EAAS,CACzB,GAAIK,EAAQG,EAAK,CACf,QAAQ,MACN,oGACF,EACA,QACF,SAAWH,IAAUG,EAInB,SAEFI,EAAO,KAAKd,EAAK,MAAMO,EAAOG,CAAG,CAAC,EAClCH,EAAQG,CACV,CAEA,OAAOI,CACT,CACF,ECvPA,IAAMC,EAAc,IAAI,YAClBC,EAAc,IAAI,YAAY,OAAO,EAErCC,EAAYC,EAACC,GACjBA,EAAK,SAAS,EAAE,EAAE,SAAS,EAAG,GAAG,EAAE,YAAY,EAD/B,aAGLC,EAAN,KAAsB,CAR7B,MAQ6B,CAAAF,EAAA,wBACjB,YAAc,IAAIG,EAClB,eAAyC,CAAC,EAE1C,MAAgC,CAAC,EACjC,UAAoC,CAAC,EAE/C,aAAc,CAAC,CAOf,cAAcC,EAA+B,CAC3C,KAAK,MAAQA,EACb,KAAK,UAAY,OAAO,YACtB,OAAO,QAAQA,CAAK,EAAE,IAAI,CAAC,CAACC,EAAOC,CAAE,IAAM,CAACA,EAAID,CAAK,CAAC,CACxD,EACA,KAAK,YAAc,IAAIF,EACvB,OAAW,CAACE,EAAOC,CAAE,IAAK,OAAO,QAAQF,CAAK,EAC5C,KAAK,YAAY,IAAIC,CAAK,CAE9B,CAOA,IAAI,YAAqB,CACvB,OACE,OAAO,KAAK,KAAK,KAAK,EAAE,OAAS,OAAO,KAAK,KAAK,cAAc,EAAE,MAEtE,CAOA,IAAI,QAAiB,CAEnB,IAAIE,EAAS,EACb,QAAWD,KAAM,OAAO,KAAK,KAAK,SAAS,EACzCC,EAAS,KAAK,IAAIA,EAAQ,SAASD,CAAE,CAAC,EAExC,QAAWA,KAAM,OAAO,OAAO,KAAK,cAAc,EAChDC,EAAS,KAAK,IAAIA,EAAQD,CAAE,EAE9B,OAAOC,CACT,CASA,kBAAkBF,EAAeG,EAAmB,CAC9CA,IAAa,SACfA,EAAW,KAAK,OAAS,GAE3B,KAAK,eAAeH,CAAK,EAAIG,EAC7B,KAAK,YAAY,IAAIH,CAAK,CAC5B,CAOA,mBACEI,EAOA,CACA,QAAWJ,KAASI,EACd,OAAOJ,GAAU,SACnB,KAAK,kBAAkBA,CAAK,EAE5B,KAAK,kBAAkBA,EAAM,MAAOA,EAAM,QAAQ,CAGxD,CAQA,aAAaC,EAAoB,CAC/B,IAAMD,EAAQ,KAAK,UAAUC,CAAE,EACzBI,EAAgB,OAAO,QAAQ,KAAK,cAAc,EAAE,KACxD,CAAC,CAACC,EAAGH,CAAQ,IAAMA,IAAaF,CAClC,EACA,GAAID,EACF,OAAOA,EACF,GAAIK,EACT,OAAOA,EAAc,CAAC,EAEtB,MAAM,IAAI,MAAM,eAAeJ,CAAE,EAAE,CAEvC,CAOA,YAAYD,EAAuB,CACjC,IAAMC,EAAK,KAAK,MAAMD,CAAK,EACrBK,EAAgB,KAAK,eAAeL,CAAK,EAC/C,GAAIC,IAAO,OACT,OAAOA,EACF,GAAII,IAAkB,OAC3B,OAAOA,EAEP,MAAM,IAAI,MAAM,kBAAkBL,CAAK,EAAE,CAE7C,CAOA,WAAY,CACV,MAAO,CAAE,GAAG,KAAK,MAAO,GAAG,KAAK,cAAe,CACjD,CAQA,YAAYA,EAAwB,CAClC,OAAOA,KAAS,KAAK,OAASA,KAAS,KAAK,cAC9C,CAKA,SAASO,EAAwB,CAC/B,IAAMH,EAAS,KAAK,YAAY,MAAMG,CAAI,EAEpCC,EAAS,CAAC,EAChB,QAAWR,KAASI,EAClB,GAAI,KAAK,YAAYJ,CAAK,EACxBQ,EAAO,KAAKR,CAAK,MACZ,CAGL,IAAMS,EAAQjB,EAAY,OAAOQ,CAAK,EACtC,QAAWJ,KAAQa,EACjBD,EAAO,KAAK,MAAMd,EAAUE,CAAI,CAAC,GAAG,CAExC,CAGF,OAAOY,CACT,CAKA,OAAOD,EAAwB,CAC7B,OAAO,KAAK,sBAAsB,KAAK,SAASA,CAAI,CAAC,CACvD,CAKA,OAAOG,EAAuB,CAC5B,OAAO,KAAK,yBAAyB,KAAK,sBAAsBA,CAAG,CAAC,CACtE,CAKA,yBAAyBN,EAA0B,CACjD,QAAWJ,KAASI,EAClB,GAAI,CAAC,KAAK,YAAYJ,CAAK,EACzB,MAAM,IAAI,MAAM,kBAAkBA,CAAK,EAAE,EAG7C,IAAMW,EAAQ,CAAC,EAEXC,EAAQ,EACZ,KAAOA,EAAQR,EAAO,QAAQ,CAC5B,IAAIJ,EAAQI,EAAOQ,CAAK,EAExB,GADAA,GAAS,EACL,CAACZ,EAAM,WAAW,KAAK,EAAG,CAC5BW,EAAM,KAAKX,CAAK,EAChB,QACF,CACA,IAAMS,EAAQ,CAAC,EACf,KAAOT,GAASA,EAAM,WAAW,KAAK,GACpCS,EAAM,KAAK,SAAST,EAAM,MAAM,EAAG,CAAC,EAAG,EAAE,CAAC,EAC1CA,EAAQI,EAAOQ,CAAK,EACpBA,GAAS,EAEXD,EAAM,KAAKlB,EAAY,OAAO,IAAI,WAAWgB,CAAK,CAAC,CAAC,CACtD,CAEA,OAAOE,EAAM,KAAK,EAAE,CACtB,CAKA,sBAAsBP,EAA4B,CAChD,IAAII,EAAmB,CAAC,EACxB,QAAWR,KAASI,EAAQ,CAC1B,IAAMH,EAAK,KAAK,YAAYD,CAAK,EACjCQ,EAAO,KAAKP,CAAE,CAChB,CACA,OAAOO,CACT,CAKA,sBAAsBE,EAAyB,CAC7C,OAAOA,EAAI,IAAKT,GACA,KAAK,aAAaA,CAAE,CAEnC,CACH,CACF","names":["Trie","__name","word","ref","char","text","states","offsets","skip","current","toRemove","reset","start","triePointer","lookaheadIndex","end","nextChar","lookStart","lookTriePointer","tokens","utf8Encoder","utf8Decoder","byteToHex","__name","byte","Llama2Tokenizer","Trie","vocab","token","id","max_id","token_id","tokens","special_token","_","text","result","bytes","ids","chars","index"]}