/**
 * One block of raw extraction output, as consumed by the converter in this file.
 *
 * NOTE(review): the field names look like the AWS Textract `Block` shape —
 * confirm against whatever produces this data.
 */
export interface ExtractionBlock {
  /** Kind of block. The code in this file only inspects the values 'PAGE' and 'LINE'. */
  BlockType: string;
  Confidence: number;
  /** Position of the block. The bounding box is what line ordering is based on (Top first, then Left). */
  Geometry: {
    BoundingBox: {
      Height: number;
      Left: number;
      Top: number;
      Width: number;
    };
    /** Outline points; note the upper-case X/Y keys (the output metadata uses lower-case x/y). */
    Polygon: Array<{ X: number; Y: number }>;
  };
  Id: string;
  /** Page the block belongs to; used as the grouping key when blocks are indexed per page. */
  Page: number;
  /** Text of the block, when it has any. For LINE blocks this is what ends up in `pageContent`. */
  Text?: string;
  /** References to other blocks by `Id`. Not read by the code visible in this file. */
  Relationships?: Array<{ Type: string; Ids: string[] }>;
}

/**
 * Metadata attached to each generated document. Every field is derived from
 * the PAGE block the document was built from, plus the caller-supplied file name.
 */
interface HierarchicalMetadata {
  /** Bounding box taken from the PAGE block's geometry. */
  BoundingBox: {
    Height: number;
    Left: number;
    Top: number;
    Width: number;
  };
  /** The PAGE block's polygon with keys converted from upper-case X/Y to lower-case x/y. */
  Polygon: Array<{ x: number; y: number }>;
  /** The PAGE block's `Page` value. */
  pageNumber: number;
  /** File name passed in by the caller; not validated here. */
  fileName: string;
  /** ISO-8601 timestamp (`Date#toISOString`) taken when the metadata object was created. */
  createdAt: string;
}

/**
 * One output document per PAGE block.
 *
 * NOTE(review): presumably meant to be structurally compatible with
 * LangChain's `Document` (pageContent + metadata) — confirm against the consumer.
 */
export interface LangChainDocument {
  /** Text of the page's LINE blocks, joined with single spaces in top-to-bottom, left-to-right order. */
  pageContent: string;
  /** Page-level geometry and provenance for this document. */
  metadata: HierarchicalMetadata;
}

/**
 * Indexes a flat list of extraction blocks by page and converts every PAGE
 * block into one document whose text is that page's LINE blocks in reading
 * order (top-to-bottom, then left-to-right).
 *
 * The input array and its blocks are never mutated.
 */
class ExtractionProcessor {
  /** All blocks seen for a given page number, in input order. */
  private readonly pageToBlocksMap: Map<number, ExtractionBlock[]> = new Map();
  /** PAGE blocks in input order; each one yields exactly one document. */
  private readonly pageBlocks: ExtractionBlock[] = [];

  /**
   * @param extractionData Blocks to index. Only blocks contained in this array
   *   are considered, so a page's PAGE and LINE blocks must be passed together.
   * @throws TypeError if `extractionData` is not an array.
   */
  constructor(private readonly extractionData: ExtractionBlock[]) {
    if (!Array.isArray(extractionData)) {
      throw new TypeError('extractionData must be an array');
    }
    this.processExtractionData();
  }

  /** Builds the page -> blocks index and collects the PAGE blocks in one pass. */
  private processExtractionData(): void {
    for (const block of this.extractionData) {
      const blocksOnPage = this.pageToBlocksMap.get(block.Page);
      if (blocksOnPage === undefined) {
        this.pageToBlocksMap.set(block.Page, [block]);
      } else {
        blocksOnPage.push(block);
      }

      if (block.BlockType === 'PAGE') {
        this.pageBlocks.push(block);
      }
    }
  }

  /** Returns the LINE blocks recorded for `pageNumber` (a fresh array; empty if the page is unknown). */
  private getLineBlocksForPage(pageNumber: number): ExtractionBlock[] {
    return this.pageToBlocksMap.get(pageNumber)?.filter(block => block.BlockType === 'LINE') ?? [];
  }

  /**
   * Returns a sorted copy of `blocks`, ordered by bounding-box Top and then
   * Left. The argument itself is left untouched.
   */
  private sortBlocksByPosition(blocks: ExtractionBlock[]): ExtractionBlock[] {
    return [...blocks].sort((a, b) => {
      const boxA = a.Geometry.BoundingBox;
      const boxB = b.Geometry.BoundingBox;
      if (boxA.Top !== boxB.Top) {
        return boxA.Top - boxB.Top;
      }
      return boxA.Left - boxB.Left;
    });
  }

  /** Builds the document metadata from a PAGE block and the caller's file name. */
  private createMetadata(pageBlock: ExtractionBlock, fileName: string): HierarchicalMetadata {
    return {
      // Copied so that later edits to the returned metadata cannot alter the input block.
      BoundingBox: { ...pageBlock.Geometry.BoundingBox },
      Polygon: pageBlock.Geometry.Polygon.map(point => ({ x: point.X, y: point.Y })),
      pageNumber: pageBlock.Page,
      fileName,
      createdAt: new Date().toISOString(),
    };
  }

  /**
   * Produces one document per PAGE block, in the order the PAGE blocks appeared.
   *
   * @param fileName Stored verbatim in each document's metadata.
   * @returns The generated documents; empty when no PAGE block was supplied.
   */
  public convertToLangChainDocuments(fileName: string): LangChainDocument[] {
    return this.pageBlocks.map(pageBlock => {
      const orderedLines = this.sortBlocksByPosition(this.getLineBlocksForPage(pageBlock.Page));

      // `Text` is optional; lines without text are skipped so they do not
      // introduce doubled separators into the page content.
      const lineTexts: string[] = [];
      for (const line of orderedLines) {
        if (line.Text) {
          lineTexts.push(line.Text);
        }
      }

      return {
        pageContent: lineTexts.join(' '),
        metadata: this.createMetadata(pageBlock, fileName),
      };
    });
  }
}

/**
 * Converts a flat list of extraction blocks into one document per PAGE block.
 *
 * Blocks are grouped by page first and batches are only ever cut on a page
 * boundary. Cutting by raw block count could put a PAGE block in one batch and
 * its remaining LINE blocks in the next, silently truncating that page's text.
 *
 * @param extractionData Blocks for one or more pages, in any order.
 * @param fileName Stored verbatim in each document's metadata.
 * @param batchSize Soft target for the number of blocks handed to a single
 *   processor. A batch is closed once it holds at least this many blocks, so a
 *   batch may exceed the target in order to keep a page whole.
 * @returns Documents ordered by the first appearance of each page in `extractionData`.
 * @throws RangeError if `batchSize` is not a positive number (zero or a
 *   negative value would otherwise never make progress).
 */
export function convertTextractToLangChain(extractionData: ExtractionBlock[], fileName: string = '', batchSize: number = 1000): LangChainDocument[] {
  // Written as a negated comparison so that NaN is rejected as well.
  if (!(batchSize > 0)) {
    throw new RangeError(`batchSize must be a positive number, got ${batchSize}`);
  }

  // Map preserves insertion order, so pages stay in order of first appearance.
  const blocksByPage = new Map<number, ExtractionBlock[]>();
  for (const block of extractionData) {
    const blocksOnPage = blocksByPage.get(block.Page);
    if (blocksOnPage === undefined) {
      blocksByPage.set(block.Page, [block]);
    } else {
      blocksOnPage.push(block);
    }
  }

  const allDocuments: LangChainDocument[] = [];
  let batch: ExtractionBlock[] = [];

  const flushBatch = (): void => {
    if (batch.length === 0) {
      return;
    }
    const processor = new ExtractionProcessor(batch);
    allDocuments.push(...processor.convertToLangChainDocuments(fileName));
    batch = [];
  };

  for (const blocksOnPage of blocksByPage.values()) {
    for (const block of blocksOnPage) {
      batch.push(block);
    }
    if (batch.length >= batchSize) {
      flushBatch();
    }
  }
  flushBatch();

  return allDocuments;
}
