{"version":3,"file":"textTokenization.mjs","sources":["../../../../../src/scene/text/canvas/utils/textTokenization.ts"],"sourcesContent":["import type { TextStyleWhiteSpace } from '../../TextStyle';\n\n/**\n * Cache of new line character codes.\n * @internal\n */\nexport const NEWLINES: number[] = [\n    0x000A, // line feed\n    0x000D, // carriage return\n];\n\n/**\n * Set of new line character codes for fast lookup.\n * @internal\n */\nexport const NEWLINES_SET = new Set(NEWLINES);\n\n/**\n * Cache of breaking space character codes.\n * @internal\n */\nexport const BREAKING_SPACES: number[] = [\n    0x0009, // character tabulation\n    0x0020, // space\n    0x2000, // en quad\n    0x2001, // em quad\n    0x2002, // en space\n    0x2003, // em space\n    0x2004, // three-per-em space\n    0x2005, // four-per-em space\n    0x2006, // six-per-em space\n    0x2008, // punctuation space\n    0x2009, // thin space\n    0x200A, // hair space\n    0x205F, // medium mathematical space\n    0x3000, // ideographic space\n];\n\n/**\n * Set of breaking space character codes for fast lookup.\n * @internal\n */\nexport const BREAKING_SPACES_SET = new Set(BREAKING_SPACES);\n\n/**\n * Cache of CSS-collapsible whitespace character codes.\n * Per CSS Text Module Level 3, only regular spaces and tabs are collapsible.\n * Unicode spaces (em space, en space, etc.) are NOT collapsible.\n * @internal\n */\nexport const COLLAPSIBLE_SPACES: number[] = [\n    0x0009, // character tabulation (tab)\n    0x0020, // space\n];\n\n/**\n * Set of collapsible space character codes for fast lookup.\n * @internal\n */\nexport const COLLAPSIBLE_SPACES_SET = new Set(COLLAPSIBLE_SPACES);\n\n/**\n * Characters that allow a line break AFTER them (they stay with the preceding word).\n * @internal\n */\nexport const BREAK_AFTER_CHARS: number[] = [\n    0x002D, // hyphen-minus\n    0x2010, // unicode hyphen\n    0x2013, // en-dash\n    0x2014, // em-dash\n    0x00AD, // soft hyphen\n];\n\n/**\n * Set of break-after character codes for fast lookup.\n * @internal\n */\nexport const BREAK_AFTER_CHARS_SET = new Set(BREAK_AFTER_CHARS);\n\n/**\n * Regex to split text while capturing newline sequences.\n * @internal\n */\nexport const NEWLINE_SPLIT_REGEX = /(\\r\\n|\\r|\\n)/;\n\n/**\n * Regex to split text on newlines without capturing.\n * @internal\n */\nexport const NEWLINE_MATCH_REGEX = /(?:\\r\\n|\\r|\\n)/;\n\n/**\n * Determines if char is a newline.\n * @param char - The character\n * @returns True if newline, False otherwise.\n * @internal\n */\nexport function isNewline(char: string): boolean\n{\n    if (typeof char !== 'string')\n    {\n        return false;\n    }\n\n    return NEWLINES_SET.has(char.charCodeAt(0));\n}\n\n/**\n * Determines if char is a breaking whitespace.\n *\n * It allows one to determine whether char should be a breaking whitespace\n * For example certain characters in CJK langs or numbers.\n * It must return a boolean.\n * @param char - The character\n * @param _nextChar - The next character (unused, for override compatibility)\n * @returns True if whitespace, False otherwise.\n * @internal\n */\nexport function isBreakingSpace(char: string, _nextChar?: string): boolean\n{\n    if (typeof char !== 'string')\n    {\n        return false;\n    }\n\n    return BREAKING_SPACES_SET.has(char.charCodeAt(0));\n}\n\n/**\n * Determines if char is a CSS-collapsible whitespace character.\n * Only regular space (U+0020) and tab (U+0009) are collapsible per CSS spec.\n * Unicode spaces like em space are NOT collapsible.\n * @param char - The character\n * @returns True if collapsible, False otherwise.\n * @internal\n */\nexport function isCollapsibleSpace(char: string): boolean\n{\n    if (typeof char !== 'string')\n    {\n        return false;\n    }\n\n    return COLLAPSIBLE_SPACES_SET.has(char.charCodeAt(0));\n}\n\n/**\n * Determines if char is a break-after character (line can break after it, but\n * the character stays with the preceding word).\n * @param char - The character\n * @returns True if break-after, False otherwise.\n * @internal\n */\nexport function isBreakAfterChar(char: string): boolean\n{\n    if (typeof char !== 'string')\n    {\n        return false;\n    }\n\n    return BREAK_AFTER_CHARS_SET.has(char.charCodeAt(0));\n}\n\n/**\n * Determines whether we should collapse breaking spaces.\n * @param whiteSpace - The TextStyle property whiteSpace\n * @returns Should collapse\n * @internal\n */\nexport function collapseSpaces(whiteSpace: TextStyleWhiteSpace): boolean\n{\n    return (whiteSpace === 'normal' || whiteSpace === 'pre-line');\n}\n\n/**\n * Determines whether we should collapse newLine chars.\n * @param whiteSpace - The white space\n * @returns should collapse\n * @internal\n */\nexport function collapseNewlines(whiteSpace: TextStyleWhiteSpace): boolean\n{\n    return (whiteSpace === 'normal');\n}\n\n/**\n * Trims breaking whitespaces from the right side of a string.\n * @param text - The text\n * @returns Trimmed string\n * @internal\n */\nexport function trimRight(text: string): string\n{\n    if (typeof text !== 'string')\n    {\n        return '';\n    }\n\n    let i = text.length - 1;\n\n    while (i >= 0 && isBreakingSpace(text[i]))\n    {\n        i--;\n    }\n\n    // Only slice if we found trailing spaces\n    return i < text.length - 1 ? text.slice(0, i + 1) : text;\n}\n\n/**\n * Splits a string into words, breaking-spaces and newLine characters\n * @param text - The text\n * @returns A tokenized array\n * @internal\n */\nexport function tokenize(text: string): string[]\n{\n    const tokens: string[] = [];\n    const tokenChars: string[] = [];\n\n    if (typeof text !== 'string')\n    {\n        return tokens;\n    }\n\n    for (let i = 0; i < text.length; i++)\n    {\n        const char = text[i];\n        const nextChar = text[i + 1];\n\n        if (isBreakingSpace(char, nextChar) || isNewline(char))\n        {\n            if (tokenChars.length > 0)\n            {\n                tokens.push(tokenChars.join(''));\n                tokenChars.length = 0;\n            }\n\n            // treat \\r\\n as a single new line token\n            if (char === '\\r' && nextChar === '\\n')\n            {\n                tokens.push('\\r\\n');\n                i++;\n            }\n            else\n            {\n                tokens.push(char);\n            }\n\n            continue;\n        }\n\n        tokenChars.push(char);\n\n        // Break-after chars stay with the word, then emit the token\n        if (isBreakAfterChar(char) && nextChar && !isBreakingSpace(nextChar) && !isNewline(nextChar))\n        {\n            tokens.push(tokenChars.join(''));\n            tokenChars.length = 0;\n        }\n    }\n\n    if (tokenChars.length > 0)\n    {\n        tokens.push(tokenChars.join(''));\n    }\n\n    return tokens;\n}\n\n/**\n * Splits a token into character groups that should not be broken apart.\n * Adjacent characters that can't be broken are combined into single groups.\n * @param token - The token to split\n * @param breakWords - Whether word breaking is enabled\n * @param splitFn - Function to split token into characters (default: grapheme segmenter)\n * @param canBreakCharsFn - Function to check if chars can be broken\n * @returns Array of character groups\n * @internal\n */\nexport function getCharacterGroups(\n    token: string,\n    breakWords: boolean,\n    splitFn: (s: string) => string[],\n    canBreakCharsFn: (char: string, nextChar: string, token: string, index: number, breakWords: boolean) => boolean,\n): string[]\n{\n    const characters = splitFn(token);\n    const groups: string[] = [];\n\n    for (let j = 0; j < characters.length; j++)\n    {\n        let char = characters[j];\n        let lastChar = char;\n\n        // Combine chars that shouldn't be split\n        let k = 1;\n\n        while (characters[j + k])\n        {\n            const nextChar = characters[j + k];\n\n            if (!canBreakCharsFn(lastChar, nextChar, token, j, breakWords))\n            {\n                char += nextChar;\n                lastChar = nextChar;\n                k++;\n            }\n            else\n            {\n                break;\n            }\n        }\n        j += k - 1;\n        groups.push(char);\n    }\n\n    return groups;\n}\n"],"names":[],"mappings":";AAMO,MAAM,QAAA,GAAqB;AAAA,EAC9B,EAAA;AAAA;AAAA,EACA;AAAA;AACJ;AAMO,MAAM,YAAA,GAAe,IAAI,GAAA,CAAI,QAAQ;AAMrC,MAAM,eAAA,GAA4B;AAAA,EACrC,CAAA;AAAA;AAAA,EACA,EAAA;AAAA;AAAA,EACA,IAAA;AAAA;AAAA,EACA,IAAA;AAAA;AAAA,EACA,IAAA;AAAA;AAAA,EACA,IAAA;AAAA;AAAA,EACA,IAAA;AAAA;AAAA,EACA,IAAA;AAAA;AAAA,EACA,IAAA;AAAA;AAAA,EACA,IAAA;AAAA;AAAA,EACA,IAAA;AAAA;AAAA,EACA,IAAA;AAAA;AAAA,EACA,IAAA;AAAA;AAAA,EACA;AAAA;AACJ;AAMO,MAAM,mBAAA,GAAsB,IAAI,GAAA,CAAI,eAAe;AAQnD,MAAM,kBAAA,GAA+B;AAAA,EACxC,CAAA;AAAA;AAAA,EACA;AAAA;AACJ;AAMO,MAAM,sBAAA,GAAyB,IAAI,GAAA,CAAI,kBAAkB;AAMzD,MAAM,iBAAA,GAA8B;AAAA,EACvC,EAAA;AAAA;AAAA,EACA,IAAA;AAAA;AAAA,EACA,IAAA;AAAA;AAAA,EACA,IAAA;AAAA;AAAA,EACA;AAAA;AACJ;AAMO,MAAM,qBAAA,GAAwB,IAAI,GAAA,CAAI,iBAAiB;AAMvD,MAAM,mBAAA,GAAsB;AAM5B,MAAM,mBAAA,GAAsB;AAQ5B,SAAS,UAAU,IAAA,EAC1B;AACI,EAAA,IAAI,OAAO,SAAS,QAAA,EACpB;AACI,IAAA,OAAO,KAAA;AAAA,EACX;AAEA,EAAA,OAAO,YAAA,CAAa,GAAA,CAAI,IAAA,CAAK,UAAA,CAAW,CAAC,CAAC,CAAA;AAC9C;AAaO,SAAS,eAAA,CAAgB,MAAc,SAAA,EAC9C;AACI,EAAA,IAAI,OAAO,SAAS,QAAA,EACpB;AACI,IAAA,OAAO,KAAA;AAAA,EACX;AAEA,EAAA,OAAO,mBAAA,CAAoB,GAAA,CAAI,IAAA,CAAK,UAAA,CAAW,CAAC,CAAC,CAAA;AACrD;AAUO,SAAS,mBAAmB,IAAA,EACnC;AACI,EAAA,IAAI,OAAO,SAAS,QAAA,EACpB;AACI,IAAA,OAAO,KAAA;AAAA,EACX;AAEA,EAAA,OAAO,sBAAA,CAAuB,GAAA,CAAI,IAAA,CAAK,UAAA,CAAW,CAAC,CAAC,CAAA;AACxD;AASO,SAAS,iBAAiB,IAAA,EACjC;AACI,EAAA,IAAI,OAAO,SAAS,QAAA,EACpB;AACI,IAAA,OAAO,KAAA;AAAA,EACX;AAEA,EAAA,OAAO,qBAAA,CAAsB,GAAA,CAAI,IAAA,CAAK,UAAA,CAAW,CAAC,CAAC,CAAA;AACvD;AAQO,SAAS,eAAe,UAAA,EAC/B;AACI,EAAA,OAAQ,UAAA,KAAe,YAAY,UAAA,KAAe,UAAA;AACtD;AAQO,SAAS,iBAAiB,UAAA,EACjC;AACI,EAAA,OAAQ,UAAA,KAAe,QAAA;AAC3B;AAQO,SAAS,UAAU,IAAA,EAC1B;AACI,EAAA,IAAI,OAAO,SAAS,QAAA,EACpB;AACI,IAAA,OAAO,EAAA;AAAA,EACX;AAEA,EAAA,IAAI,CAAA,GAAI,KAAK,MAAA,GAAS,CAAA;AAEtB,EAAA,OAAO,KAAK,CAAA,IAAK,eAAA,CAAgB,IAAA,CAAK,CAAC,CAAC,CAAA,EACxC;AACI,IAAA,CAAA,EAAA;AAAA,EACJ;AAGA,EAAA,OAAO,CAAA,GAAI,KAAK,MAAA,GAAS,CAAA,GAAI,KAAK,KAAA,CAAM,CAAA,EAAG,CAAA,GAAI,CAAC,CAAA,GAAI,IAAA;AACxD;AAQO,SAAS,SAAS,IAAA,EACzB;AACI,EAAA,MAAM,SAAmB,EAAC;AAC1B,EAAA,MAAM,aAAuB,EAAC;AAE9B,EAAA,IAAI,OAAO,SAAS,QAAA,EACpB;AACI,IAAA,OAAO,MAAA;AAAA,EACX;AAEA,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,IAAA,CAAK,QAAQ,CAAA,EAAA,EACjC;AACI,IAAA,MAAM,IAAA,GAAO,KAAK,CAAC,CAAA;AACnB,IAAA,MAAM,QAAA,GAAW,IAAA,CAAK,CAAA,GAAI,CAAC,CAAA;AAE3B,IAAA,IAAI,gBAAgB,IAAA,EAAM,QAAQ,CAAA,IAAK,SAAA,CAAU,IAAI,CAAA,EACrD;AACI,MAAA,IAAI,UAAA,CAAW,SAAS,CAAA,EACxB;AACI,QAAA,MAAA,CAAO,IAAA,CAAK,UAAA,CAAW,IAAA,CAAK,EAAE,CAAC,CAAA;AAC/B,QAAA,UAAA,CAAW,MAAA,GAAS,CAAA;AAAA,MACxB;AAGA,MAAA,IAAI,IAAA,KAAS,IAAA,IAAQ,QAAA,KAAa,IAAA,EAClC;AACI,QAAA,MAAA,CAAO,KAAK,MAAM,CAAA;AAClB,QAAA,CAAA,EAAA;AAAA,MACJ,CAAA,MAEA;AACI,QAAA,MAAA,CAAO,KAAK,IAAI,CAAA;AAAA,MACpB;AAEA,MAAA;AAAA,IACJ;AAEA,IAAA,UAAA,CAAW,KAAK,IAAI,CAAA;AAGpB,IAAA,IAAI,gBAAA,CAAiB,IAAI,CAAA,IAAK,QAAA,IAAY,CAAC,eAAA,CAAgB,QAAQ,CAAA,IAAK,CAAC,SAAA,CAAU,QAAQ,CAAA,EAC3F;AACI,MAAA,MAAA,CAAO,IAAA,CAAK,UAAA,CAAW,IAAA,CAAK,EAAE,CAAC,CAAA;AAC/B,MAAA,UAAA,CAAW,MAAA,GAAS,CAAA;AAAA,IACxB;AAAA,EACJ;AAEA,EAAA,IAAI,UAAA,CAAW,SAAS,CAAA,EACxB;AACI,IAAA,MAAA,CAAO,IAAA,CAAK,UAAA,CAAW,IAAA,CAAK,EAAE,CAAC,CAAA;AAAA,EACnC;AAEA,EAAA,OAAO,MAAA;AACX;AAYO,SAAS,kBAAA,CACZ,KAAA,EACA,UAAA,EACA,OAAA,EACA,eAAA,EAEJ;AACI,EAAA,MAAM,UAAA,GAAa,QAAQ,KAAK,CAAA;AAChC,EAAA,MAAM,SAAmB,EAAC;AAE1B,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,UAAA,CAAW,QAAQ,CAAA,EAAA,EACvC;AACI,IAAA,IAAI,IAAA,GAAO,WAAW,CAAC,CAAA;AACvB,IAAA,IAAI,QAAA,GAAW,IAAA;AAGf,IAAA,IAAI,CAAA,GAAI,CAAA;AAER,IAAA,OAAO,UAAA,CAAW,CAAA,GAAI,CAAC,CAAA,EACvB;AACI,MAAA,MAAM,QAAA,GAAW,UAAA,CAAW,CAAA,GAAI,CAAC,CAAA;AAEjC,MAAA,IAAI,CAAC,eAAA,CAAgB,QAAA,EAAU,UAAU,KAAA,EAAO,CAAA,EAAG,UAAU,CAAA,EAC7D;AACI,QAAA,IAAA,IAAQ,QAAA;AACR,QAAA,QAAA,GAAW,QAAA;AACX,QAAA,CAAA,EAAA;AAAA,MACJ,CAAA,MAEA;AACI,QAAA;AAAA,MACJ;AAAA,IACJ;AACA,IAAA,CAAA,IAAK,CAAA,GAAI,CAAA;AACT,IAAA,MAAA,CAAO,KAAK,IAAI,CAAA;AAAA,EACpB;AAEA,EAAA,OAAO,MAAA;AACX;;;;"}