/* eslint-disable max-len */
import type { Grammar } from './grammar.js'

/**
 * Regular expression patterns and constants used by the lexer for tokenization.
 * These patterns identify different types of tokens in Jexl expressions.
 */

/** Matches numeric literals (integers and floats, including negative numbers) */
const numericRegex = /^-?(?:(?:[0-9]*\.[0-9]+)|[0-9]+)$/

/** Matches valid identifier names (variables, function names, etc.) */
const identRegex =
  /^[a-zA-Zа-яА-Я_\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF$][a-zA-Zа-яА-Я0-9_\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF$]*$/

/** Matches escaped backslashes in string literals */
const escEscRegex = /\\\\/

/** Matches whitespace-only strings */
const whitespaceRegex = /^\s*$/

/**
 * Regex elements that are processed before operator elements.
 * Includes string literals, whitespace, and boolean literals.
 */
const preOpRegexElements = [
  // Strings
  "'(?:(?:\\\\')|[^'])*'",
  '"(?:(?:\\\\")|[^"])*"',
  // Whitespace
  '\\s+',
  // Booleans
  '\\btrue\\b',
  '\\bfalse\\b',
  // Null
  '\\bnull\\b',
  // Undefined
  '\\bundefined\\b',
  // Numerics (without negative symbol)
  '(?:[0-9]+(?:\\.[0-9]+)?|\\.[0-9]+)',
]
/**
 * Regex elements that are processed after operator elements.
 * Includes identifiers.
 */
const postOpRegexElements = [
  // Identifiers
  '[a-zA-Zа-яА-Я_\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\\$][a-zA-Z0-9а-яА-Я_\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\\$]*',
]
/**
 * Token types after which a minus sign should be treated as a negation operator
 * rather than a binary subtraction operator.
 */
const unaryOpsAfter = ['binaryOp', 'unaryOp', 'openParen', 'openBracket', 'question', 'colon', 'comma']

/**
 * Represents a lexical token in a Jexl expression.
 * Each token contains information about its type, processed value, and original text.
 */
interface Token {
  /** The type of token (e.g., 'literal', 'identifier', 'binaryOp') */
  type: string
  /** The processed value of the token (e.g., parsed number, unquoted string) */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  value: any
  /** The original raw text from the expression string */
  raw: string
}

/**
 * Lexer is responsible for the lexical analysis phase of Jexl expression parsing.
 * It takes a raw expression string and converts it into a sequence of tokens that
 * can be consumed by the Parser.
 *
 * The Lexer's primary responsibilities are:
 * - Identifying and categorizing different parts of the expression (literals, operators, etc.)
 * - Converting raw text into meaningful tokens with appropriate types and values
 * - Handling special cases like negative numbers and string escaping
 * - Minimal syntax validation (only what's needed for tokenization)
 *
 * @example
 * ```typescript
 * const lexer = new Lexer(grammar)
 * const tokens = lexer.tokenize('user.name | upper')
 * // Results in tokens like:
 * // [
 * //   { type: 'identifier', value: 'user', raw: 'user' },
 * //   { type: 'dot', value: '.', raw: '.' },
 * //   { type: 'identifier', value: 'name', raw: 'name' },
 * //   { type: 'pipe', value: '|', raw: ' | ' },
 * //   { type: 'identifier', value: 'upper', raw: 'upper' }
 * // ]
 * ```
 *
 * ## Tokenization Process
 *
 * 1. **Split**: Expression is split into elements using a regex
 * 2. **Classify**: Each element is classified by type (literal, operator, etc.)
 * 3. **Process**: Values are processed (e.g., parse numbers, unquote strings)
 * 4. **Optimize**: Adjacent whitespace is consolidated with neighboring tokens
 *
 * ## Token Types
 *
 * - **literal**: String, number, or boolean values
 * - **identifier**: Variable names, function names, property names
 * - **binaryOp**: Binary operators like `+`, `-`, `==`, `&&`
 * - **unaryOp**: Unary operators like `!`, `-` (negation)
 * - **dot**: Property access operator `.`
 * - **pipe**: Transform operator `|`
 * - **openParen/closeParen**: Parentheses for grouping `()`
 * - **openBracket/closeBracket**: Brackets for array access/filtering `[]`
 * - **openCurl/closeCurl**: Braces for object literals `{}`
 * - **question/colon**: Ternary operator parts `?` and `:`
 * - **comma**: Argument separator `,`
 *
 * ## Error Handling
 *
 * The Lexer performs minimal error checking. It will throw errors only when:
 * - It encounters characters that cannot be classified into any token type
 * - String literals are malformed (unclosed quotes)
 *
 * Most syntax errors (like mismatched operators) are left for the Parser to detect.
 */
export default class Lexer {
  /** The grammar configuration containing operators and other language elements */
  private _grammar: Grammar
  /** Cached regex for splitting expressions, built on first use */
  private _splitRegex?: RegExp

  /**
   * Creates a new Lexer instance with the given grammar configuration.
   *
   * @param grammar The grammar containing operators, functions, and other language elements
   */
  constructor(grammar: Grammar) {
    this._grammar = grammar
  }

  /**
   * Splits a Jexl expression string into an array of expression elements.
   * @param str A Jexl expression string
   * @returns An array of substrings defining the functional
   *      elements of the expression.
   */
  getElements(str: string): string[] {
    const regex = this._getSplitRegex()
    return str.split(regex).filter((elem) => {
      // Remove empty strings
      return elem
    })
  }

  /**
   * Converts an array of expression elements into an array of tokens.  Note that
   * the resulting array may not equal the element array in length, as any
   * elements that consist only of whitespace get appended to the previous
   * token's "raw" property.  For the structure of a token object, please see
   * {@link Lexer#tokenize}.
   * @param elements An array of Jexl expression elements to be
   *      converted to tokens
   * @returns An array of token objects.
   */
  getTokens(elements: string[]): Token[] {
    const tokens: Token[] = []
    let negate = false
    for (let i = 0; i < elements.length; i++) {
      const element = elements[i]
      if (!element) continue // Skip empty elements

      if (this._isWhitespace(element)) {
        if (tokens.length > 0) {
          tokens[tokens.length - 1].raw += element
        }
      } else if ((element === '+' || element === '-') && this._isUnary(tokens)) {
        const lastToken = tokens.length > 0 ? tokens[tokens.length - 1] : null
        if (
          lastToken &&
          lastToken.type === 'binaryOp' &&
          (lastToken.value === '+' || lastToken.value === '-') &&
          !lastToken.raw.match(/\s$/)
        ) {
          throw new Error(`Unexpected token '${element}' after operator '${lastToken.value}'`)
        }
        let nextElement = ''
        for (let j = i + 1; j < elements.length; j++) {
          if (!this._isWhitespace(elements[j])) {
            nextElement = elements[j]
            break
          }
        }
        if (element === '-') {
          if (nextElement.match(numericRegex)) {
            negate = true
          } else {
            const token = this._createToken(element)
            token.type = 'unaryOp'
            tokens.push(token)
          }
        } else {
          // Unary plus. If it's not before a number, it's a unary op.
          // Otherwise, it's optional and we can just ignore it.
          if (!nextElement.match(numericRegex)) {
            const token = this._createToken(element)
            token.type = 'unaryOp'
            tokens.push(token)
          }
        }
      } else {
        if (negate) {
          elements[i] = '-' + element
          negate = false
        }
        tokens.push(this._createToken(elements[i]))
      }
    }
    // Catch a - at the end of the string. Let the parser handle that issue.
    if (negate) {
      tokens.push(this._createToken('-'))
    }
    return tokens
  }

  /**
   * Converts a Jexl expression string into an array of tokens.
   * This is the main entry point for lexical analysis.
   *
   * Each token is an object with the following structure:
   * ```typescript
   * {
   *   type: string,     // Token type (e.g., 'literal', 'identifier', 'binaryOp')
   *   value: any,       // Processed value (parsed number, unquoted string, etc.)
   *   raw: string       // Original text including any whitespace
   * }
   * ```
   *
   * ## Token Types
   *
   * - **literal**: String, number, or boolean values
   * - **identifier**: Variable names, function names, property names
   * - **binaryOp**: Binary operators like `+`, `-`, `==`, `&&`
   * - **unaryOp**: Unary operators like `!`, `-` (negation)
   * - **Grammar elements**: Control characters defined in grammar (dot, pipe, etc.)
   *
   * ## Value Processing
   *
   * - **Strings**: Quotes are removed and escape sequences processed
   * - **Numbers**: Converted to numeric values using `parseFloat()`
   * - **Booleans**: `"true"` and `"false"` become boolean values
   * - **Others**: Remain as original strings
   *
   * @param str The Jexl expression string to be tokenized
   * @returns An array of token objects representing the expression
   * @throws {Error} if the string contains invalid tokens
   *
   * @example
   * ```typescript
   * lexer.tokenize('user.age >= 18')
   * // Returns:
   * // [
   * //   { type: 'identifier', value: 'user', raw: 'user' },
   * //   { type: 'dot', value: '.', raw: '.' },
   * //   { type: 'identifier', value: 'age', raw: 'age' },
   * //   { type: 'binaryOp', value: '>=', raw: ' >= ' },
   * //   { type: 'literal', value: 18, raw: '18' }
   * // ]
   * ```
   */
  tokenize(str: string): Token[] {
    const elements = this.getElements(str)
    return this.getTokens(elements)
  }

  /**
   * Creates a new token object from an element of a Jexl string. See
   * {@link Lexer#tokenize} for a description of the token object.
   * @param element The element from which a token should be made
   * @returns A token object describing the provided element.
   * @throws {Error} if the provided string is not a valid expression element.
   * @private
   */
  _createToken(element: string): Token {
    const token: Token = {
      type: 'literal',
      value: element,
      raw: element,
    }
    if (element[0] === '"' || element[0] === "'") {
      token.value = this._unquote(element)
    } else if (element.match(numericRegex)) {
      token.value = parseFloat(element)
    } else if (element === 'true' || element === 'false') {
      token.value = element === 'true'
    } else if (element === 'null') {
      token.value = null
    } else if (element === 'undefined') {
      token.value = undefined
    } else if (this._grammar.elements[element]) {
      token.type = this._grammar.elements[element].type
    } else if (element.match(identRegex)) {
      token.type = 'identifier'
    } else {
      throw new Error(`Invalid expression token: ${element}`)
    }
    return token
  }

  /**
   * Escapes a string so that it can be treated as a string literal within a
   * regular expression.
   * @param str The string to be escaped
   * @returns the RegExp-escaped string.
   * @see https://developer.mozilla.org/en/docs/Web/JavaScript/Guide/Regular_Expressions
   * @private
   */
  _escapeRegExp(str: string): string {
    str = str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
    if (str.match(identRegex)) {
      str = '\\b' + str + '\\b'
    }
    return str
  }

  /**
   * Gets a RegEx object appropriate for splitting a Jexl string into its core
   * elements.
   * @returns {RegExp} An element-splitting RegExp object
   * @private
   */
  _getSplitRegex(): RegExp {
    if (!this._splitRegex) {
      // Sort by most characters to least, then regex escape each
      const elemArray = Object.keys(this._grammar.elements)
        .sort((a, b) => {
          return b.length - a.length
        })
        .map((elem) => {
          return this._escapeRegExp(elem)
        }, this)
      this._splitRegex = new RegExp(
        '(' + [preOpRegexElements.join('|'), elemArray.join('|'), postOpRegexElements.join('|')].join('|') + ')'
      )
    }
    return this._splitRegex
  }

  /**
   * Determines whether the addition of a '-' token should be interpreted as a
   * negative symbol for an upcoming number, given an array of tokens already
   * processed.
   * @param {Array<Object>} tokens An array of tokens already processed
   * @returns {boolean} true if adding a '-' should be considered a negative
   *      symbol or a '+' should be considered a positive symbol; false otherwise
   * @private
   */
  _isUnary(tokens: Token[]): boolean {
    if (!tokens.length) return true
    const lastToken = tokens[tokens.length - 1]
    if (!lastToken) return true
    return unaryOpsAfter.some((type) => type === lastToken.type)
  }

  /**
   * A utility function to determine if a string consists of only space
   * characters.
   * @param {string} str A string to be tested
   * @returns {boolean} true if the string is empty or consists of only spaces;
   *      false otherwise.
   * @private
   */
  _isWhitespace(str: string): boolean {
    return !!str.match(whitespaceRegex)
  }

  /**
   * Removes the beginning and trailing quotes from a string, unescapes any
   * escaped quotes on its interior, and unescapes any escaped escape
   * characters. Note that this function is not defensive; it assumes that the
   * provided string is not empty, and that its first and last characters are
   * actually quotes.
   * @param {string} str A string whose first and last characters are quotes
   * @returns {string} a string with the surrounding quotes stripped and escapes
   *      properly processed.
   * @private
   */
  _unquote(str: string): string {
    const quote = str[0]
    if (!quote) {
      throw new Error('Cannot unquote empty string')
    }
    const escQuoteRegex = new RegExp('\\\\' + quote, 'g')
    return str
      .substr(1, str.length - 2)
      .replace(escQuoteRegex, quote)
      .replace(escEscRegex, '\\')
  }
}
