// Semantic chunker service: splits a base narrative and its comments into ordered text chunks.
import { Injectable } from '@nestjs/common';
|
|
|
|
/**
 * One piece of text produced by splitting a base narrative or a comment,
 * together with enough provenance to trace it back to its source.
 */
export type SemanticChunk = {
  /** Zero-based position of this chunk in the list it was emitted in. */
  chunkIndex: number;

  /**
   * Origin of the text: the base narrative (`'base_record'`), a single
   * comment (`'comment'`), or a combination of both (`'mixed'`).
   */
  sourceKind: 'base_record' | 'comment' | 'mixed';

  /**
   * Identifier of the source the text was taken from, or `null` when there is
   * none. `SemanticChunkerService` in this file sets it to the comment id for
   * comment chunks and to `null` for base-narrative chunks.
   */
  sourceRefId: string | null;

  /** The chunk's text content. */
  text: string;

  /**
   * Free-form, producer-defined details about the chunk (for example the
   * section name and the chunk's index within its source).
   *
   * The value type stays `any` so that existing readers of this bag keep
   * compiling; narrow the values before relying on them.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- kept for backward compatibility
  metadata: Record<string, any>;
};
|
|
|
|
/**
 * Splits a record's base narrative and its comments into an ordered list of
 * text chunks, each tagged with where it came from.
 *
 * Text is first split into paragraphs on blank lines; paragraphs longer than
 * the configured limit are then broken into smaller pieces, preferably at
 * whitespace so that words stay intact.
 */
@Injectable()
export class SemanticChunkerService {
  /** Default upper bound, in UTF-16 code units, for the length of one chunk. */
  static readonly DEFAULT_MAX_CHUNK_LENGTH = 500;

  /**
   * Paragraph separator: two or more consecutive line breaks. Accepts both
   * `\n` and `\r\n`, and tolerates spaces/tabs on the otherwise blank lines.
   */
  private static readonly PARAGRAPH_BREAK = /\r?\n(?:[ \t]*\r?\n)+/;

  /** Matches a single whitespace character; hoisted so loops do not rebuild it. */
  private static readonly WHITESPACE = /\s/;

  /**
   * Builds the chunk list for one record.
   *
   * Base-narrative chunks come first, followed by the chunks of each comment
   * in the order the comments were given. `chunkIndex` runs continuously
   * across the whole result, while `metadata.localIndex` restarts at 0 for
   * the base narrative and for every comment.
   *
   * @param baseNarrative - Main text of the record; empty or blank text yields no chunks.
   * @param comments - Comments to chunk after the base narrative; a missing list is treated as empty.
   * @param maxChunkLength - Maximum chunk length in UTF-16 code units (defaults to 500).
   * @returns The chunks in emission order.
   * @throws RangeError If `maxChunkLength` is not a positive integer.
   */
  chunkText(
    baseNarrative: string,
    comments: Array<{ id: string; content: string }>,
    maxChunkLength: number = SemanticChunkerService.DEFAULT_MAX_CHUNK_LENGTH,
  ): SemanticChunk[] {
    if (!Number.isInteger(maxChunkLength) || maxChunkLength < 1) {
      throw new RangeError(
        `maxChunkLength must be a positive integer, received ${maxChunkLength}`,
      );
    }

    const chunks: SemanticChunk[] = [];

    for (const [index, text] of this.splitText(baseNarrative, maxChunkLength).entries()) {
      chunks.push({
        chunkIndex: chunks.length,
        sourceKind: 'base_record',
        sourceRefId: null,
        text,
        metadata: { section: 'base', localIndex: index },
      });
    }

    // Callers outside the type system may pass null/undefined; treat that as "no comments".
    for (const comment of comments ?? []) {
      for (const [index, text] of this.splitText(comment.content, maxChunkLength).entries()) {
        chunks.push({
          chunkIndex: chunks.length,
          sourceKind: 'comment',
          sourceRefId: comment.id,
          text,
          metadata: { section: 'comment', localIndex: index, commentId: comment.id },
        });
      }
    }

    return chunks;
  }

  /**
   * Splits text into trimmed, non-empty pieces no longer than `maxLength`.
   *
   * @param text - Text to split; null, undefined, empty or blank input yields `[]`.
   * @param maxLength - Maximum piece length in UTF-16 code units.
   * @returns Paragraphs that fit as-is, and the sub-pieces of those that do not.
   */
  private splitText(
    text: string,
    maxLength: number = SemanticChunkerService.DEFAULT_MAX_CHUNK_LENGTH,
  ): string[] {
    const normalized = (text ?? '').trim();
    if (!normalized) return [];

    return normalized
      .split(SemanticChunkerService.PARAGRAPH_BREAK)
      .map((part) => part.trim())
      .filter(Boolean)
      .flatMap((paragraph) =>
        paragraph.length <= maxLength
          ? [paragraph]
          : this.splitLongParagraph(paragraph, maxLength),
      );
  }

  /**
   * Breaks one over-long paragraph into pieces of at most `maxLength` code units.
   *
   * Each piece ends at the last whitespace inside its window when there is
   * one, so words are not cut in half. A run with no whitespace at all is cut
   * at the limit, shifted by one code unit if the cut would otherwise land
   * between the two halves of a surrogate pair.
   */
  private splitLongParagraph(paragraph: string, maxLength: number): string[] {
    const isWhitespaceAt = (position: number): boolean =>
      SemanticChunkerService.WHITESPACE.test(paragraph.charAt(position));

    const pieces: string[] = [];
    let cursor = 0;

    while (cursor < paragraph.length) {
      // Whitespace between pieces is dropped rather than counted against the limit.
      if (isWhitespaceAt(cursor)) {
        cursor += 1;
        continue;
      }

      let end = Math.min(cursor + maxLength, paragraph.length);

      // Only look for a better break when the window would cut through a word.
      if (end < paragraph.length && !isWhitespaceAt(end)) {
        let breakAt = end - 1;
        while (breakAt > cursor && !isWhitespaceAt(breakAt)) breakAt -= 1;

        if (breakAt > cursor) {
          end = breakAt;
        } else if (this.splitsSurrogatePair(paragraph, end)) {
          // Step back to keep the pair whole; if that would make no progress
          // (limit of one code unit), take the whole pair instead.
          end = end - 1 > cursor ? end - 1 : end + 1;
        }
      }

      pieces.push(paragraph.slice(cursor, end).trimEnd());
      cursor = end;
    }

    return pieces;
  }

  /** Reports whether cutting `text` just before `index` would separate a UTF-16 surrogate pair. */
  private splitsSurrogatePair(text: string, index: number): boolean {
    const before = text.charCodeAt(index - 1);
    const after = text.charCodeAt(index);
    return before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
  }
}
|