/******************************************************************************
 * Copyright 2022 TypeFox GmbH
 * This program and the accompanying materials are made available under the
 * terms of the MIT License, which is available in the project root.
 ******************************************************************************/

import { type Grammar, GrammarAST, GrammarUtils, RegExpUtils } from 'langium';
import { type Generated, expandToNode, joinToNode, toString } from 'langium/generate';
import type { LangiumLanguageConfig } from '../../package-types.js';
import { collectKeywords } from '../langium-util.js';

/**
 * Monarch Language Definition, describes aspects & token categories of target language.
 * Built by `generateMonarch` and rendered by `prettyPrintLangDef`.
 */
interface LanguageDefinition {
    /** Language identifier (the Langium config id); shown in the header comment of the generated file */
    readonly name: string;
    /** Grammar keywords that contain at least one ASCII letter (see `getKeywords`) */
    readonly keywords: string[];
    /** Those entries of `symbols` that contain no brace, bracket or parenthesis character */
    readonly operators: string[];
    /** Grammar keywords without any ASCII letter (see `getSymbols`); printed as one alternation regex */
    readonly symbols: string[];
    /** Category appended to all tokens, built as '.' followed by the language id */
    readonly tokenPostfix: string;
}

/**
 * Monarch Tokenizer, consists of an object that defines states.
 * The states are printed in array order; `getTokenizerStates` places the initial state first.
 */
interface Tokenizer {
    states: State[]
}

/**
 * Name of a State.
 * Used as the key of the printed state and, prefixed with '@', to refer to the state
 * (e.g. in `{ include: '@whitespace' }` entries).
 */
type StateName = string;

/**
 * Each state is defined as an array of rules which are used to match the input.
 * Rules can be regular, or other States whose rules we should include in this state;
 * a nested State is printed as an `include` entry referencing it by name, not expanded inline.
 */
interface State {
    // key under which the state is printed in the tokenizer object
    name: StateName
    // entries in matching order; `isRule` tells regular rules and included states apart
    rules: Array<Rule | State>
}

/**
 * A rule that matches input. Can have either an action, or an array of cases.
 */
interface Rule {
    // pattern to match; a string is converted with `new RegExp(...)` when the rule is printed
    regex: RegExp | string;
    // a single action, or several guarded cases that each select an action
    action: Action | Case[];
}

/**
 * A case that selects a specific action by matching a guard pattern
 */
interface Case {
    // guard text, printed in single quotes; this file uses '@operators', '@keywords' and '@default'
    guard: string;
    // action applied when the guard is selected
    action: Action;
}

/**
 * Type guard that tells a matching Rule apart from an included State.
 * An entry only counts as a Rule when both its `regex` and its `action` are defined.
 * @param obj Entry taken from a state's rule list
 * @returns `true` if the entry is a Rule, `false` otherwise
 */
function isRule(obj: State | Rule): obj is Rule {
    const { regex, action } = obj as Rule;
    return !(regex === undefined || action === undefined);
}

/**
 * Name of a token type, such as 'string'.
 * Other values assigned in this file are 'keyword', 'operator', 'comment', 'white',
 * the empty string (symbol left untagged), and the type or name of a terminal rule.
 */
type Token = string;

/**
 * Token class to be used for CSS rendering, such as 'keyword', 'component', or 'type.identifier'
 */
type TokenClass = string;

/**
 * Next state that proceeds from an action, can also be a pop or a push of the current state (like for nested block comments).
 * Since StateName is a plain string, the '@pop' and '@push' members only document the special values;
 * they do not restrict what can be assigned.
 */
type NextState = StateName | '@pop' | '@push';

/**
 * An action performed when a rule (or a case) matches token.
 * It can determine the token class, as well whether to push/pop a tokenizer state.
 * Plain actions are serialized with `JSON.stringify`, so only the properties that are set get printed.
 */
interface Action {
    // token type assigned to the matched text
    token?: Token
    // CSS-oriented token class; not set by any rule built in the code visible here
    tokenClass?: TokenClass
    // state to switch to after the match, e.g. '@comment' to enter or '@pop' to leave a state
    next?: NextState
    // other more advanced states omitted...
}

/**
 * Abstract representation of a Monarch grammar file.
 * Assembled by `generateMonarch` and turned into file content by `prettyPrint`.
 */
interface MonarchGrammar {
    /** Keyword, operator and symbol lists plus naming information */
    readonly languageDefinition: LanguageDefinition;
    /** Tokenizer states with their rules */
    readonly tokenizer: Tokenizer;
}

/**
 * Produces the content of a Monarch syntax highlighting file for a Langium grammar.
 *
 * An abstract Monarch grammar (language definition plus tokenizer states) is built first
 * and then rendered into its concrete textual form.
 * @param grammar Langium grammar that supplies keywords, symbols and terminal rules
 * @param config Langium language config; its `id` names the language and forms the token postfix
 * @returns Generated Monarch syntax highlighting file content
 */
export function generateMonarch(grammar: Grammar, config: LangiumLanguageConfig): string {
    const allSymbols = getSymbols(grammar);

    // operators are the symbols that contain no brace, bracket or parenthesis
    const bracketPattern = /[{}[\]()]/;
    const operators = allSymbols.filter(symbol => !bracketPattern.test(symbol));

    // build abstract monarch grammar representation
    const languageDefinition: LanguageDefinition = {
        name: config.id,  // identifier for generating the grammar export
        keywords: getKeywords(grammar),
        operators,
        symbols: allSymbols,
        tokenPostfix: '.' + config.id, // category appended to all tokens
    };
    const tokenizer: Tokenizer = {
        states: getTokenizerStates(grammar)
    };

    // return concrete monarch grammar representation
    return prettyPrint({ languageDefinition, tokenizer });
}

/**
 * Builds the Monarch tokenizer states for a Langium grammar.
 *
 * Three states are produced: 'initial' (terminal rules, then an include of the whitespace
 * state, then symbol/operator handling), 'whitespace' and 'comment'.
 * @param grammar Langium grammar to source tokenizer states from
 * @returns Array of tokenizer states, with the initial state first
 */
function getTokenizerStates(grammar: Grammar): State[] {
    const terminalRules = getTerminalRules(grammar);

    const whitespaceState: State = {
        name: 'whitespace',
        rules: getWhitespaceRules(grammar)
    };

    const commentState: State = {
        name: 'comment',
        rules: getCommentRules(grammar)
    };

    // operator & symbol case handling
    const symbolRule: Rule = {
        regex: '@symbols',
        action: [
            { guard: '@operators', action: { token: 'operator' } },
            // by default, leave the symbol alone
            { guard: '@default', action: { token: '' } }
        ]
    };

    // initial state, name is arbitrary, just needs to come first;
    // the whitespace include precedes the symbol rule so that comment
    // sequences are not classified as symbols
    const initialState: State = {
        name: 'initial',
        rules: [...terminalRules, whitespaceState, symbolRule]
    };

    return [initialState, whitespaceState, commentState];
}

/**
 * Renders an abstract Monarch grammar as text that can be written to a file.
 * The output is a module with a default export holding the language definition entries
 * followed by the tokenizer, terminated by a trailing new line.
 * @param monarchGrammar Grammar to pretty print
 * @returns Monarch grammar in concrete form
 */
function prettyPrint(monarchGrammar: MonarchGrammar): string {
    const { languageDefinition, tokenizer } = monarchGrammar;
    const fileContent = expandToNode`
        // Monarch syntax highlighting for the ${languageDefinition.name} language.
        export default {
            ${prettyPrintLangDef(languageDefinition)}

            ${prettyPrintTokenizer(tokenizer)}
        };
    `;

    return toString(fileContent.appendNewLine());
}

/**
 * Generates an entry for a language definition, given a name (token category) and values.
 * Each value is printed as a single-quoted string literal; backslashes and single quotes
 * inside a value are escaped so that the generated file remains syntactically valid.
 * @param name Category of language definition to add
 * @param values Values to add under the given category
 * @returns GeneratorNode containing this printed language definition entry
 */
function genLanguageDefEntry(name: string, values: string[]): Generated {
    // prefix every backslash and single quote with a backslash; all other characters are kept as they are
    const literals = values.map(value => `'${value.replace(/[\\']/g, '\\$&')}'`);
    return expandToNode`
        ${name}: [
            ${literals.join(',')}
        ],
    `;
}

/**
 * Pretty prints the language definition portion of a Monarch grammar.
 * Keywords and operators are printed as string arrays, the symbols as one regular
 * expression that alternates between all (escaped) symbols.
 * @param languageDef LanguageDefinition to pretty print
 * @returns GeneratorNode containing the printed language definition entries
 */
function prettyPrintLangDef(languageDef: LanguageDefinition): Generated {
    // A regex alternation takes the first alternative that matches, so a symbol has to be
    // listed before any of its own prefixes ('==' before '='). The incoming order gives no
    // such guarantee, hence sort a copy by descending length (the input array is left untouched).
    const longestFirst = [...languageDef.symbols].sort((a, b) => b.length - a.length);

    // With no symbols at all the alternation would be empty and match the empty string
    // at every position; emit a pattern that can never match instead.
    const symbolsSource = longestFirst.length > 0
        ? longestFirst.map(symbol => RegExpUtils.escapeRegExp(symbol)).join('|')
        : '(?!)';

    return expandToNode`
        ${genLanguageDefEntry('keywords', languageDef.keywords)}
        ${genLanguageDefEntry('operators', languageDef.operators)}
        ${/* special case, identify symbols via singular regex*/ undefined}
        symbols: ${new RegExp(symbolsSource).toString()},
    `;
}

/**
 * Pretty prints the tokenizer portion of a Monarch grammar file.
 * Every state is printed via `prettyPrintState`, one after another, inside a `tokenizer` object.
 * @param tokenizer Tokenizer portion to print out
 * @returns GeneratorNode containing the printed tokenizer
 */
function prettyPrintTokenizer(tokenizer: Tokenizer): Generated {
    const printedStates = joinToNode(tokenizer.states, prettyPrintState, { appendNewLineIfNotEmpty: true });
    return expandToNode`
        tokenizer: {
            ${printedStates}
        }
    `;
}

/**
 * Pretty prints a tokenizer state as a named array of its rules.
 * Every entry of the state is printed via `prettyPrintRule`.
 * @param state Tokenizer state to pretty print
 * @returns GeneratorNode containing the printed state
 */
function prettyPrintState(state: State): Generated {
    const { name, rules } = state;
    const printedRules = joinToNode(rules, prettyPrintRule, { appendNewLineIfNotEmpty: true });
    return expandToNode`
        ${name}: [
            ${printedRules}
        ],
    `;
}

/**
 * Pretty prints one entry of a state's rule list.
 * A State entry becomes an `include` of that state by name (its contents are pulled in
 * implicitly); a Rule entry is printed with its pattern as a regex literal and its action.
 * @param ruleOrState Rule to pretty print, or a State to include
 * @returns Generator node containing this printed rule
 */
function prettyPrintRule(ruleOrState: Rule | State): Generated {
    if (!isRule(ruleOrState)) {
        // reference to another state
        return expandToNode`{ include: '@${ruleOrState.name}' },`;
    }

    const { regex, action } = ruleOrState;
    // string patterns are turned into a RegExp so both kinds print as a regex literal
    let pattern: RegExp;
    if (regex instanceof RegExp) {
        pattern = regex;
    } else {
        pattern = new RegExp(regex);
    }
    return expandToNode`{ regex: ${pattern.toString()}, action: ${prettyPrintAction(action)} },`;
}

/**
 * Pretty prints the action of a Rule.
 * A plain action is serialized as JSON; an array of cases becomes a `cases` object
 * that maps each single-quoted guard to its printed action.
 * @param action Action to print, or the cases to choose an action from
 * @returns Action in concrete form
 */
function prettyPrintAction(action: Action | Case[]): string {
    if (Array.isArray(action)) {
        // one "'guard': action" entry per case
        const entries: string[] = [];
        for (const { guard, action: guardedAction } of action) {
            entries.push(`'${guard}': ${prettyPrintAction(guardedAction)}`);
        }
        return `{ cases: { ${entries.join(', ')} }}`;
    }
    // plain action
    return JSON.stringify(action);
}

/**
 * Determines the Monarch token name for a Langium terminal rule.
 * A rule named 'string' (compared case-insensitively) always maps to 'string', regardless of
 * its declared type; otherwise the name of the declared type is used when there is one,
 * and the rule's own name as a last resort.
 * @param rule Rule to convert to a Monarch token name
 * @returns The equivalent monarch token name, or the original rule name
 */
function getMonarchTokenName(rule: GrammarAST.TerminalRule): string {
    const namedString = rule.name.toLowerCase() === 'string';
    if (namedString) {
        return 'string';
    }
    return rule.type ? rule.type.name : rule.name;
}

/**
 * Collects the rules of the whitespace state from the terminal rules of a Langium grammar.
 *
 * Only comment terminals and terminals whose regex matches whitespace contribute. A comment
 * terminal whose first part has both a start and an end sequence yields a rule that matches
 * the start sequence and switches to the comment state; everything else yields a rule that
 * matches the complete terminal regex.
 * @param grammar Langium grammar to extract whitespace rules from
 * @returns Array of Monarch whitespace rules
 */
function getWhitespaceRules(grammar: Grammar): Rule[] {
    const result: Rule[] = [];
    for (const candidate of grammar.rules) {
        if (!GrammarAST.isTerminalRule(candidate)) {
            continue;
        }

        const pattern = GrammarUtils.terminalRegex(candidate);
        const isComment = GrammarUtils.isCommentTerminal(candidate);
        if (!isComment && !RegExpUtils.isWhitespace(pattern)) {
            // neither comment nor whitespace
            continue;
        }

        // token name is either comment or whitespace
        const token = isComment ? 'comment' : 'white';
        const firstPart = RegExpUtils.getTerminalParts(pattern)[0];

        if (isComment && firstPart && firstPart.start !== '' && firstPart.end !== '') {
            // comment w/ a start & end sequence (multi-line): only match the start here
            // and jump into the comment state, which handles the rest
            result.push({
                regex: firstPart.start,
                action: { token, next: '@' + token }
            });
        } else {
            // single regex rule, generally for whitespace
            result.push({
                regex: pattern,
                action: { token }
            });
        }
    }
    return result;
}

/**
 * Collects the rules of the comment state from the comment terminals of a Langium grammar.
 *
 * Comment terminals whose first part has both a start and an end sequence contribute three
 * rules each, the order of which matters. Accounts for multi-line comments, but without nesting.
 * @param grammar Langium grammar to extract comment rules from
 * @returns Array of Monarch comment rules
 */
function getCommentRules(grammar: Grammar): Rule[] {
    const token = 'comment';
    const result: Rule[] = [];
    for (const candidate of grammar.rules) {
        if (!GrammarAST.isTerminalRule(candidate) || !GrammarUtils.isCommentTerminal(candidate)) {
            continue;
        }

        const [delimiters] = RegExpUtils.getTerminalParts(GrammarUtils.terminalRegex(candidate));
        if (!delimiters || delimiters.start === '' || delimiters.end === '') {
            // no start/end pair, nothing to handle inside the comment state
            continue;
        }

        const { start, end } = delimiters;
        result.push(
            // 1st, anything that's not in the start sequence
            { regex: `[^${start}]+`, action: { token } },
            // 2nd, end of sequence, pop this state, keeping others on the stack
            { regex: end, action: { token, next: '@pop' } },
            // 3rd, otherwise, start sequence characters are OK in this state
            { regex: `[${start}]`, action: { token } }
        );
    }
    return result;
}

/**
 * Retrieves non-comment terminal rules, creating associated actions for them.
 *
 * Terminals whose regex matches whitespace are skipped. A terminal whose regex matches
 * (anywhere in) at least one keyword gets a case-based action, so that keywords are tagged
 * as 'keyword' and everything else keeps the terminal's own token name.
 * @param grammar Grammar to get non-comment terminals from
 * @returns Array of Rules to add to a Monarch tokenizer state
 */
function getTerminalRules(grammar: Grammar): Rule[] {
    // the keywords only depend on the grammar, so collect them once instead of per terminal rule
    const keywords = getKeywords(grammar);

    const rules: Rule[] = [];
    for (const rule of grammar.rules) {
        if (GrammarAST.isTerminalRule(rule) && !GrammarUtils.isCommentTerminal(rule)) {
            const regex = GrammarUtils.terminalRegex(rule);

            if (RegExpUtils.isWhitespace(regex)) {
                // disallow terminal rules that match whitespace
                continue;
            }

            const tokenName = getMonarchTokenName(rule);
            // default action...
            let action: Action | Case[] = { token: tokenName };

            if (keywords.some(keyword => regex.test(keyword))) {
                // this rule overlaps with at least one keyword
                // add case so keywords aren't tagged incorrectly as this token type
                action = [{
                    guard: '@keywords',
                    action: { token: 'keyword' }
                },
                {
                    guard: '@default',
                    action // include default action from above
                }];
            }

            rules.push({
                regex,
                action
            });
        }
    }
    return rules;
}

/**
 * Matches any text that contains at least one ASCII letter.
 * `getKeywords` keeps the grammar keywords that match, `getSymbols` keeps those that do not.
 */
const KeywordRegex = /[A-Za-z]/;

/**
 * Retrieves the word-like keywords of a grammar, i.e. all collected keywords
 * that contain at least one ASCII letter.
 * @param grammar Grammar to get keywords from
 * @returns Array of keywords
 */
function getKeywords(grammar: Grammar): string[] {
    const wordLike: string[] = [];
    for (const candidate of collectKeywords(grammar)) {
        if (KeywordRegex.test(candidate)) {
            wordLike.push(candidate);
        }
    }
    return wordLike;
}

/**
 * Retrieves the symbols of a grammar, i.e. all collected keywords
 * that contain no ASCII letter at all.
 * @param grammar Grammar to get symbols from
 * @returns Array of symbols, effective inverse of getKeywords
 */
function getSymbols(grammar: Grammar): string[] {
    const symbols: string[] = [];
    for (const candidate of collectKeywords(grammar)) {
        if (!KeywordRegex.test(candidate)) {
            symbols.push(candidate);
        }
    }
    return symbols;
}
