import { Injectable } from '@nestjs/common';

/** Default upper bound (in characters) for a single chunk. */
const DEFAULT_MAX_CHUNK_LENGTH = 500;

/**
 * One unit of text extracted from a record and its comments, annotated
 * with provenance so downstream consumers can trace it back to its origin.
 */
export type SemanticChunk = {
  /** Position of this chunk in the overall output list. */
  chunkIndex: number;
  /** Origin of the text. NOTE(review): 'mixed' is declared but never produced here — confirm it is used elsewhere. */
  sourceKind: 'base_record' | 'comment' | 'mixed';
  /** Comment id for comment chunks; null for base-record chunks. */
  sourceRefId: string | null;
  /** The chunk text itself (trimmed, non-empty). */
  text: string;
  /** Free-form provenance details (section, localIndex, commentId). */
  metadata: Record<string, unknown>;
};

@Injectable()
export class SemanticChunkerService {
  /**
   * Splits a base narrative and its comments into an ordered list of chunks:
   * base-narrative chunks first, then each comment's chunks in input order.
   *
   * @param baseNarrative - Main record text; empty/blank input yields no chunks.
   * @param comments - Associated comments; a nullish list is treated as empty.
   * @param maxChunkLength - Maximum characters per chunk (default 500).
   *   Non-positive values fall back to the default.
   * @returns All chunks with globally sequential `chunkIndex` values.
   */
  chunkText(
    baseNarrative: string,
    comments: Array<{ id: string; content: string }>,
    maxChunkLength: number = DEFAULT_MAX_CHUNK_LENGTH,
  ): SemanticChunk[] {
    const chunks: SemanticChunk[] = [];

    for (const [index, text] of this.splitText(baseNarrative, maxChunkLength).entries()) {
      chunks.push({
        chunkIndex: chunks.length,
        sourceKind: 'base_record',
        sourceRefId: null,
        text,
        metadata: { section: 'base', localIndex: index },
      });
    }

    // `??` guards specifically against null/undefined; `||` would also
    // (pointlessly) clobber other falsy values.
    for (const comment of comments ?? []) {
      for (const [index, text] of this.splitText(comment.content, maxChunkLength).entries()) {
        chunks.push({
          chunkIndex: chunks.length,
          sourceKind: 'comment',
          sourceRefId: comment.id,
          text,
          metadata: { section: 'comment', localIndex: index, commentId: comment.id },
        });
      }
    }

    return chunks;
  }

  /**
   * Splits text into trimmed, non-empty paragraph chunks. Paragraphs are
   * delimited by blank lines; any paragraph longer than `maxLength` is
   * hard-wrapped into fixed-size slices (character-based, may split words).
   */
  private splitText(text: string, maxLength: number = DEFAULT_MAX_CHUNK_LENGTH): string[] {
    // Guard against a non-positive limit, which would otherwise make the
    // slicing loop below spin forever.
    const limit = maxLength > 0 ? maxLength : DEFAULT_MAX_CHUNK_LENGTH;

    const normalized = (text || '').trim();
    if (!normalized) return [];

    const paragraphs = normalized
      .split(/\n{2,}/)
      .map((part) => part.trim())
      .filter(Boolean);

    const chunks: string[] = [];
    for (const paragraph of paragraphs) {
      if (paragraph.length <= limit) {
        chunks.push(paragraph);
        continue;
      }
      // Hard-wrap: fixed-size slices; the final slice may be shorter.
      for (let cursor = 0; cursor < paragraph.length; cursor += limit) {
        chunks.push(paragraph.slice(cursor, cursor + limit).trim());
      }
    }
    // Drop any slice that trimmed down to the empty string.
    return chunks.filter(Boolean);
  }
}