import { describe, it, expect, beforeEach } from 'vitest';
import { MarkdownDocumentProcessor } from '../document-processor.js';
import type { RAGDocument, RAGChunk, DocumentMetadata } from '../types.js';

describe('MarkdownDocumentProcessor', () => {
  // Instance under test, shared by every case in this suite.
  let processor: MarkdownDocumentProcessor;

  // Assign a newly constructed processor before each test runs.
  beforeEach(() => {
    processor = new MarkdownDocumentProcessor();
  });

  describe('parse', () => {
    // Happy path: headings and paragraphs, no frontmatter.
    it('should parse a simple markdown document', async () => {
      const filePath = '/test/doc.md';
      const markdown = [
        '# Test Document',
        '',
        'This is a test document with some content.',
        '',
        '## Section 1',
        '',
        'This is section 1 content.',
      ].join('\n');

      const parsed = await processor.parse(markdown, filePath);

      // Identity and source fields.
      expect(parsed.id).toMatch(/^doc-/);
      expect(parsed.path).toBe(filePath);
      expect(parsed.content).toBe(markdown);

      // Title is expected to match the H1 text; size the input's string length.
      const { metadata } = parsed;
      expect(metadata.title).toBe('Test Document');
      expect(metadata.size).toBe(markdown.length);
    });

    // Frontmatter fields should surface on the parsed document's metadata.
    it('should extract frontmatter metadata', async () => {
      const frontmatter = [
        '---',
        'title: Custom Title',
        'author: John Doe',
        'tags: [tutorial, guide]',
        'version: 1.0.0',
        '---',
      ];
      const body = ['', '# Test Document', '', 'Content here.'];
      const markdown = [...frontmatter, ...body].join('\n');

      const { metadata } = await processor.parse(markdown, '/test/doc.md');

      // The frontmatter title is expected instead of the H1 text.
      expect(metadata.title).toBe('Custom Title');
      expect(metadata.author).toBe('John Doe');
      expect(metadata.tags).toEqual(['tutorial', 'guide']);
      expect(metadata.version).toBe('1.0.0');
    });

    // No heading and no frontmatter: the expected title is 'doc' for path '/test/doc.md'.
    it('should handle documents without headers', async () => {
      const markdown = ['This is just a paragraph.', 'Another paragraph here.'].join('\n\n');

      const parsed = await processor.parse(markdown, '/test/doc.md');

      expect(parsed.metadata.title).toBe('doc');
      expect(parsed.content).toBe(markdown);
    });

    // A fenced code block must remain in the parsed content, and the H1 still supplies the title.
    it('should extract code blocks', async () => {
      const markdown = [
        '# Code Example',
        '',
        "Here's some code:",
        '',
        '```typescript',
        'function hello() {',
        "  console.log('Hello, world!');",
        '}',
        '```',
        '',
        'More text here.',
      ].join('\n');

      const parsed = await processor.parse(markdown, '/test/doc.md');

      expect(parsed.content).toContain('```typescript');
      expect(parsed.metadata.title).toBe('Code Example');
    });
  });

  describe('chunk', () => {
    // One header plus three short paragraphs is expected to yield four chunks.
    it('should chunk by paragraphs', async () => {
      const content = [
        '# Title',
        'First paragraph here.',
        'Second paragraph here.',
        'Third paragraph here.',
      ].join('\n\n');
      const doc: RAGDocument = {
        id: 'doc-123',
        path: '/test/doc.md',
        content,
        metadata: {
          title: 'Title',
          lastModified: new Date().toISOString(),
          size: 100,
        },
      };

      const chunks = await processor.chunk(doc, 100, 10);

      expect(chunks).toHaveLength(4); // Header + 3 paragraphs

      const headerChunk = chunks[0];
      expect(headerChunk.content).toBe('# Title');
      expect(headerChunk.metadata.type).toBe('header');

      const firstParagraphChunk = chunks[1];
      expect(firstParagraphChunk.content).toBe('First paragraph here.');
      expect(firstParagraphChunk.metadata.type).toBe('paragraph');
    });

    // A paragraph far longer than the chunk size must be split, and no chunk
    // may exceed the size plus the overlap allowance.
    it('should respect chunk size limits', async () => {
      const chunkSize = 100;
      const overlap = 20;
      const repeatedSentence = 'This is a very long paragraph. '.repeat(20);
      const content = ['# Title', '', repeatedSentence, '', 'Short paragraph.'].join('\n');
      const doc: RAGDocument = {
        id: 'doc-456',
        path: '/test/doc.md',
        content,
        metadata: {
          title: 'Title',
          lastModified: new Date().toISOString(),
          size: 1000,
        },
      };

      const chunks = await processor.chunk(doc, chunkSize, overlap);

      // The long paragraph has to show up in more than one chunk.
      const piecesOfLongParagraph = chunks.filter((chunk) =>
        chunk.content.includes('very long paragraph'),
      );
      expect(piecesOfLongParagraph.length).toBeGreaterThan(1);

      // Tolerance for word boundaries: chunk size plus overlap (100 + 20).
      const maxLength = chunkSize + overlap;
      for (const chunk of chunks) {
        expect(chunk.content.length).toBeLessThanOrEqual(maxLength);
      }
    });

    // Consecutive chunks are expected to share content: the first word of each
    // chunk should already appear in the chunk before it.
    it('should handle chunk overlap', async () => {
      const doc: RAGDocument = {
        id: 'doc-789',
        path: '/test/doc.md',
        content: `First sentence. Second sentence. Third sentence. Fourth sentence.`,
        metadata: {
          title: 'Test',
          lastModified: new Date().toISOString(),
          size: 100
        }
      };

      const chunks = await processor.chunk(doc, 30, 10);

      // The test expects a 30-character limit to produce more than one chunk.
      expect(chunks.length).toBeGreaterThan(1);

      for (let i = 0; i < chunks.length - 1; i++) {
        // Overlap is not exact due to word boundaries, so only the leading
        // word of the following chunk is looked up in the current one.
        const nextFirstWord = chunks[i + 1].content.split(' ')[0];
        const hasOverlap = chunks[i].content.includes(nextFirstWord);

        // The final pair (i === chunks.length - 2) is exempt from the check.
        expect(hasOverlap || i === chunks.length - 2).toBe(true);
      }
    });

    // A fenced code block longer than the chunk size (50) must not be split:
    // exactly one `code` chunk is expected, holding both the first and the
    // last statement of the function.
    it('should preserve code blocks as single chunks', async () => {
      const doc: RAGDocument = {
        id: 'doc-code',
        path: '/test/code.md',
        content: `# Code Example

Some text before.

\`\`\`typescript
function longFunction() {
  // This is a long code block
  // that should not be split
  const a = 1;
  const b = 2;
  return a + b;
}
\`\`\`

Some text after.`,
        metadata: {
          title: 'Code Example',
          lastModified: new Date().toISOString(),
          size: 300
        }
      };

      const chunks = await processor.chunk(doc, 50, 10);

      // The document contains one fenced block, so one code chunk is expected.
      const codeChunks = chunks.filter(c => c.metadata.type === 'code');
      expect(codeChunks).toHaveLength(1);

      const codeChunk = codeChunks[0];
      expect(codeChunk).toBeDefined();
      // Start and end of the function in the same chunk shows it was kept whole.
      expect(codeChunk?.content).toContain('function longFunction()');
      expect(codeChunk?.content).toContain('return a + b;');
      expect(codeChunk?.metadata.language).toBe('typescript');
    });

    // Header chunks should report their heading depth in document order.
    it('should handle nested headers', async () => {
      const content = [
        '# Main Title',
        '## Section 1',
        'Content for section 1.',
        '### Subsection 1.1',
        'Content for subsection.',
        '## Section 2',
        'Content for section 2.',
      ].join('\n\n');
      const doc: RAGDocument = {
        id: 'doc-nested',
        path: '/test/nested.md',
        content,
        metadata: {
          title: 'Main Title',
          lastModified: new Date().toISOString(),
          size: 200,
        },
      };

      const chunks = await processor.chunk(doc, 100, 10);
      const headerChunks = chunks.filter((chunk) => chunk.metadata.type === 'header');

      // Levels for: "#", "##", "###", "##".
      const expectedLevels = [1, 2, 3, 2];
      expect(headerChunks).toHaveLength(4);
      expectedLevels.forEach((level, position) => {
        expect(headerChunks[position].metadata.level).toBe(level);
      });
    });

    // No two chunks of the same document may share an id.
    it('should generate unique chunk IDs', async () => {
      const content = ['Para 1', 'Para 2', 'Para 3'].join('\n\n');
      const doc: RAGDocument = {
        id: 'doc-unique',
        path: '/test/unique.md',
        content,
        metadata: {
          title: 'Test',
          lastModified: new Date().toISOString(),
          size: 50,
        },
      };

      const chunks = await processor.chunk(doc, 100, 10);

      // A Set drops duplicates, so equal sizes mean every id is distinct.
      const allIds = chunks.map((chunk) => chunk.id);
      const distinctCount = new Set(allIds).size;
      expect(distinctCount).toBe(allIds.length);
    });

    // Each chunk's `index` must equal its position in the returned array.
    it('should maintain chunk order with indices', async () => {
      const content = ['# Title', 'First', 'Second', 'Third'].join('\n\n');
      const doc: RAGDocument = {
        id: 'doc-order',
        path: '/test/order.md',
        content,
        metadata: {
          title: 'Title',
          lastModified: new Date().toISOString(),
          size: 50,
        },
      };

      const chunks = await processor.chunk(doc, 100, 10);

      for (let position = 0; position < chunks.length; position++) {
        expect(chunks[position].index).toBe(position);
      }
    });
  });

  describe('extractMetadata', () => {
    // Minimal input: a single H1 and nothing else.
    it('should extract basic metadata from path', async () => {
      const markdown = '# Simple Doc';

      const { title, size, lastModified } = await processor.extractMetadata(
        markdown,
        '/docs/guide.md',
      );

      expect(title).toBe('Simple Doc');
      expect(size).toBe(markdown.length);
      expect(lastModified).toBeDefined();
    });

    // When both frontmatter and an H1 supply a title, the frontmatter one is
    // expected; list and nested-map values should be returned as structures.
    it('should prioritize frontmatter metadata', async () => {
      const markdown = [
        '---',
        'title: Frontmatter Title',
        'author: Jane Smith',
        'tags:',
        '  - api',
        '  - reference',
        'custom:',
        '  category: technical',
        '---',
        '',
        '# Different Title',
      ].join('\n');

      const extracted = await processor.extractMetadata(markdown, '/test.md');

      expect(extracted.title).toBe('Frontmatter Title');
      expect(extracted.author).toBe('Jane Smith');
      expect(extracted.tags).toEqual(['api', 'reference']);
      expect(extracted.custom).toEqual({ category: 'technical' });
    });

    // Empty input: the expected title is 'empty' for path '/empty.md', with size zero.
    it('should handle empty content', async () => {
      const emptyMarkdown = '';

      const extracted = await processor.extractMetadata(emptyMarkdown, '/empty.md');

      expect(extracted.title).toBe('empty');
      expect(extracted.size).toBe(0);
    });

    // The H1 does not have to be the first line of the document.
    it('should extract title from first header if no frontmatter', async () => {
      const markdown = ['Some intro text', '# Actual Title Here', 'More content'].join('\n\n');

      const { title } = await processor.extractMetadata(markdown, '/test.md');

      expect(title).toBe('Actual Title Here');
    });

    // The `author` key is present but has no value (only a trailing space);
    // it is expected to come back as undefined while `title` is still read.
    it('should handle malformed frontmatter gracefully', async () => {
      const markdown = [
        '---',
        'title: Incomplete',
        'author: ', // trailing space is part of the fixture
        '---',
        '',
        '# Doc',
      ].join('\n');

      const extracted = await processor.extractMetadata(markdown, '/test.md');

      expect(extracted.title).toBe('Incomplete');
      expect(extracted.author).toBeUndefined();
    });
  });

  describe('edge cases', () => {
    // An empty file should parse cleanly and produce no chunks.
    it('should handle empty documents', async () => {
      const parsed = await processor.parse('', '/empty.md');

      expect(parsed.content).toBe('');
      expect(parsed.metadata.title).toBe('empty');

      const emptyChunks = await processor.chunk(parsed, 100, 10);
      expect(emptyChunks).toHaveLength(0);
    });

    // 1000 repetitions of a short sentence: many chunks are expected, each
    // within size + overlap and each tagged with the source document id.
    it('should handle very large documents', async () => {
      const chunkSize = 200;
      const overlap = 20;
      const documentId = 'doc-large';
      const hugeText = 'Large paragraph. '.repeat(1000);
      const doc: RAGDocument = {
        id: documentId,
        path: '/large.md',
        content: hugeText,
        metadata: {
          title: 'Large',
          lastModified: new Date().toISOString(),
          size: hugeText.length,
        },
      };

      const chunks = await processor.chunk(doc, chunkSize, overlap);

      expect(chunks.length).toBeGreaterThan(10);

      const maxLength = chunkSize + overlap; // 220
      for (const chunk of chunks) {
        expect(chunk.content.length).toBeLessThanOrEqual(maxLength);
        expect(chunk.documentId).toBe(documentId);
      }
    });

    // Inline emphasis markers are expected to stay in the title verbatim, and
    // the blockquote line should end up in a chunk typed 'blockquote'.
    it('should handle special markdown characters', async () => {
      const content = [
        '# Title with **bold** and *italic*',
        '',
        'List:',
        '- Item 1',
        '- Item 2',
        '',
        '> Blockquote here',
        '',
        '[Link](https://example.com)',
      ].join('\n');

      const doc = await processor.parse(content, '/special.md');
      expect(doc.metadata.title).toBe('Title with **bold** and *italic*');

      const chunks = await processor.chunk(doc, 100, 10);
      const blockquoteChunk = chunks.find(c => c.content.includes('> Blockquote'));

      // Fail with a clear message when no chunk holds the blockquote, rather
      // than reporting an `undefined` vs 'blockquote' type mismatch.
      expect(blockquoteChunk).toBeDefined();
      expect(blockquoteChunk?.metadata.type).toBe('blockquote');
    });
  });
});