Files
neo/backend/src/knowledge/services/semantic-chunker.service.ts

72 lines
1.8 KiB
TypeScript

import { Injectable } from '@nestjs/common';
/**
 * One piece of text produced by SemanticChunkerService.chunkText, together
 * with bookkeeping that records where the text came from.
 */
export type SemanticChunk = {
/** Zero-based position of the chunk in the array returned by chunkText. */
chunkIndex: number;
/**
 * Origin of the text. chunkText emits 'base_record' for pieces of the base
 * narrative and 'comment' for pieces of a comment; 'mixed' is not produced
 * by the chunker in this file.
 */
sourceKind: 'base_record' | 'comment' | 'mixed';
/** Id of the originating comment, or null for base-narrative chunks. */
sourceRefId: string | null;
/** The chunk text, trimmed of leading and trailing whitespace. */
text: string;
/**
 * Free-form extra data. The chunker writes `section` ('base' or 'comment'),
 * `localIndex` (position of the piece within its own source text) and, for
 * comment chunks, `commentId`.
 */
metadata: Record<string, any>;
};
/**
 * Splits a record's base narrative and its comments into an ordered list of
 * text chunks.
 *
 * Text is first divided into paragraphs at blank lines; any paragraph longer
 * than MAX_CHUNK_LENGTH is further cut into consecutive slices of at most
 * that many UTF-16 code units.
 */
@Injectable()
export class SemanticChunkerService {
  /** Upper bound, in UTF-16 code units, on the length of a single chunk. */
  private static readonly MAX_CHUNK_LENGTH = 500;

  /** Paragraph separator: two or more consecutive line breaks (LF or CRLF). */
  private static readonly PARAGRAPH_BREAK = /(?:\r?\n){2,}/;

  /**
   * Builds the chunk list for one record.
   *
   * Base-narrative chunks come first, followed by the chunks of each comment
   * in the order the comments are given. `chunkIndex` numbers the chunks
   * consecutively across the whole result, while `metadata.localIndex`
   * restarts at 0 for the base narrative and for every comment.
   *
   * @param baseNarrative Main text of the record; empty or whitespace-only
   *   text contributes no chunks.
   * @param comments Comments to append; a null/undefined list is treated as
   *   empty, and comments whose content is empty contribute no chunks.
   * @returns The chunks in output order (possibly an empty array).
   */
  chunkText(
    baseNarrative: string,
    comments: Array<{ id: string; content: string }>,
  ): SemanticChunk[] {
    const chunks: SemanticChunk[] = [];

    this.splitText(baseNarrative).forEach((text, localIndex) => {
      chunks.push({
        // chunks.length is read before the push, so indices are 0, 1, 2, ...
        chunkIndex: chunks.length,
        sourceKind: 'base_record',
        sourceRefId: null,
        text,
        metadata: { section: 'base', localIndex },
      });
    });

    for (const comment of comments ?? []) {
      this.splitText(comment.content).forEach((text, localIndex) => {
        chunks.push({
          chunkIndex: chunks.length,
          sourceKind: 'comment',
          sourceRefId: comment.id,
          text,
          metadata: { section: 'comment', localIndex, commentId: comment.id },
        });
      });
    }

    return chunks;
  }

  /**
   * Splits text into trimmed, non-empty pieces of at most MAX_CHUNK_LENGTH
   * UTF-16 code units.
   *
   * @param text Raw text; null/undefined/blank input yields an empty array.
   * @returns The pieces in document order.
   */
  private splitText(text: string): string[] {
    const normalized = (text ?? '').trim();
    if (!normalized) return [];

    const limit = SemanticChunkerService.MAX_CHUNK_LENGTH;
    const pieces: string[] = [];

    for (const rawParagraph of normalized.split(SemanticChunkerService.PARAGRAPH_BREAK)) {
      const paragraph = rawParagraph.trim();
      if (!paragraph) continue;

      if (paragraph.length <= limit) {
        pieces.push(paragraph);
        continue;
      }

      let cursor = 0;
      while (cursor < paragraph.length) {
        const end = this.sliceEnd(paragraph, cursor, limit);
        const piece = paragraph.slice(cursor, end).trim();
        // A slice consisting only of whitespace trims to '' and is dropped.
        if (piece) pieces.push(piece);
        cursor = end;
      }
    }

    return pieces;
  }

  /**
   * Returns the exclusive end offset for a slice starting at `start` that is
   * at most `limit` code units long and does not end between the two halves
   * of a surrogate pair.
   */
  private sliceEnd(text: string, start: number, limit: number): number {
    const end = Math.min(start + limit, text.length);
    if (end >= text.length) return end;

    const last = text.charCodeAt(end - 1);
    const next = text.charCodeAt(end);
    const splitsPair =
      last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff;

    // Back off by one unit so the whole pair moves to the next slice; the
    // `end - 1 > start` guard guarantees the cursor still advances.
    return splitsPair && end - 1 > start ? end - 1 : end;
  }
}