Add phased knowledge layer: comments and semantic linking pipeline
This commit is contained in:
71
backend/src/knowledge/services/semantic-chunker.service.ts
Normal file
71
backend/src/knowledge/services/semantic-chunker.service.ts
Normal file
@@ -0,0 +1,71 @@
|
||||
import { Injectable } from '@nestjs/common';
|
||||
|
||||
/**
 * One retrieval-ready slice of text produced by SemanticChunkerService,
 * carrying enough provenance to trace the chunk back to its source.
 */
export type SemanticChunk = {
  /** Global, zero-based position of this chunk across all sources. */
  chunkIndex: number;
  /** Origin of the text: the base record, a single comment, or mixed. */
  // NOTE(review): 'mixed' is declared but never produced by the chunker
  // visible in this file — confirm a producer exists elsewhere.
  sourceKind: 'base_record' | 'comment' | 'mixed';
  /** Id of the originating comment; null for base-record chunks. */
  sourceRefId: string | null;
  /** The chunk's text content. */
  text: string;
  /** Free-form per-chunk details (e.g. section, localIndex, commentId). */
  metadata: Record<string, any>;
};
|
||||
|
||||
@Injectable()
|
||||
export class SemanticChunkerService {
|
||||
chunkText(
|
||||
baseNarrative: string,
|
||||
comments: Array<{ id: string; content: string }>,
|
||||
): SemanticChunk[] {
|
||||
const chunks: SemanticChunk[] = [];
|
||||
|
||||
const baseParts = this.splitText(baseNarrative);
|
||||
for (const [index, text] of baseParts.entries()) {
|
||||
chunks.push({
|
||||
chunkIndex: chunks.length,
|
||||
sourceKind: 'base_record',
|
||||
sourceRefId: null,
|
||||
text,
|
||||
metadata: { section: 'base', localIndex: index },
|
||||
});
|
||||
}
|
||||
|
||||
for (const comment of comments || []) {
|
||||
const commentParts = this.splitText(comment.content);
|
||||
for (const [index, text] of commentParts.entries()) {
|
||||
chunks.push({
|
||||
chunkIndex: chunks.length,
|
||||
sourceKind: 'comment',
|
||||
sourceRefId: comment.id,
|
||||
text,
|
||||
metadata: { section: 'comment', localIndex: index, commentId: comment.id },
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return chunks;
|
||||
}
|
||||
|
||||
private splitText(text: string): string[] {
|
||||
const normalized = (text || '').trim();
|
||||
if (!normalized) return [];
|
||||
|
||||
const paragraphs = normalized
|
||||
.split(/\n{2,}/)
|
||||
.map((part) => part.trim())
|
||||
.filter(Boolean);
|
||||
|
||||
const chunks: string[] = [];
|
||||
for (const paragraph of paragraphs) {
|
||||
if (paragraph.length <= 500) {
|
||||
chunks.push(paragraph);
|
||||
continue;
|
||||
}
|
||||
|
||||
let cursor = 0;
|
||||
while (cursor < paragraph.length) {
|
||||
chunks.push(paragraph.slice(cursor, cursor + 500).trim());
|
||||
cursor += 500;
|
||||
}
|
||||
}
|
||||
|
||||
return chunks.filter(Boolean);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user