WIP - some progress with semantic linking but still needs a lot of work

This commit is contained in:
Francisco Gaona
2026-04-11 23:30:25 +02:00
parent 12b0a0881e
commit 320f8c4266
5 changed files with 220 additions and 17 deletions

View File

@@ -3,6 +3,7 @@ import { TenantDatabaseService } from '../../tenant/tenant-database.service';
import { MeilisearchService } from '../../search/meilisearch.service';
import { getCentralPrisma } from '../../prisma/central-prisma.service';
import { OpenAIConfig } from '../../voice/interfaces/integration-config.interface';
import { randomUUID } from 'crypto';
import {
DefaultSemanticProjectionAdapter,
SemanticProjectionAdapter,
@@ -16,7 +17,7 @@ export class SemanticOrchestratorService {
private readonly adapters: SemanticProjectionAdapter[] = [new DefaultSemanticProjectionAdapter()];
private readonly defaultEmbeddingModel =
process.env.OPENAI_EMBEDDING_MODEL || 'text-embedding-3-small';
private readonly semanticEmbedderName = 'semantic-openai';
private readonly semanticEmbedderName = 'default';
constructor(
private readonly tenantDbService: TenantDatabaseService,
@@ -32,6 +33,9 @@ export class SemanticOrchestratorService {
userId?: string,
trigger: string = 'manual',
) {
this.logger.log(
`Semantic refresh start: ${objectApiName}:${recordId} (trigger=${trigger})`,
);
const resolvedTenantId = await this.tenantDbService.resolveTenantId(tenantId);
const knex = await this.tenantDbService.getTenantKnexById(resolvedTenantId);
@@ -44,6 +48,7 @@ export class SemanticOrchestratorService {
const tableName = this.getTableName(objectDefinition);
const record = await knex(tableName).where({ id: recordId }).first();
if (!record) {
this.logger.warn(`Record not found for semantic refresh: ${objectApiName}:${recordId}`);
return { skipped: true };
}
@@ -53,6 +58,9 @@ export class SemanticOrchestratorService {
parent_record_id: recordId,
})
.orderBy('created_at', 'asc');
this.logger.log(
`Semantic refresh source: ${objectApiName}:${recordId} comments=${comments.length}`,
);
const adapter = this.adapters.find((candidate) => candidate.supports(objectApiName))!;
const projection = adapter.buildProjection({
@@ -63,13 +71,30 @@ export class SemanticOrchestratorService {
});
const documentId = await this.upsertSemanticDocument(knex, projection);
const chunks = this.chunkerService.chunkText(projection.narrative, comments);
// Use embeddingNarrative (plain values, no labels) so lexical noise from 'key:'
// prefixes doesn't inflate match scores. Comments are passed separately so they
// are not double-counted (narrative already embeds them with 'Comment N:' prefix).
const chunks = this.chunkerService.chunkText(projection.embeddingNarrative, comments);
this.logger.log(
`Semantic refresh chunking: ${objectApiName}:${recordId} chunks=${chunks.length}`,
);
await this.replaceChunks(knex, documentId, chunks);
const openAiConfig = await this.getOpenAiConfig(resolvedTenantId);
await this.indexChunks(resolvedTenantId, projection, chunks, openAiConfig);
await this.generateSuggestions(resolvedTenantId, projection, chunks, openAiConfig, userId, trigger);
const embedderReady = await this.indexChunks(resolvedTenantId, projection, chunks, openAiConfig);
await this.generateSuggestions(
resolvedTenantId,
projection,
chunks,
openAiConfig,
embedderReady,
userId,
trigger,
);
this.logger.log(
`Semantic refresh complete: ${objectApiName}:${recordId} document=${documentId}`,
);
return { documentId, chunkCount: chunks.length };
}
@@ -111,8 +136,10 @@ export class SemanticOrchestratorService {
return existing.id;
}
const newId = randomUUID();
const [created] = await knex('semantic_documents')
.insert({
id: newId,
entity_type: projection.entityType,
entity_id: projection.entityId,
title: projection.title,
@@ -124,10 +151,18 @@ export class SemanticOrchestratorService {
})
.returning('id');
return typeof created === 'string' ? created : created.id;
if (created && typeof created === 'object' && created.id) {
return created.id;
}
// Fallback: MySQL may return a numeric insert id (often 0 for UUID PKs) instead of a row object, so use the UUID generated above.
return newId;
}
private async replaceChunks(knex: any, documentId: string, chunks: any[]) {
if (!documentId) {
this.logger.warn('Skipping chunk replace: missing semantic document id.');
return;
}
await knex('semantic_chunks').where({ semantic_document_id: documentId }).delete();
if (!chunks.length) return;
@@ -152,20 +187,28 @@ export class SemanticOrchestratorService {
openAiConfig: OpenAIConfig | null,
) {
if (!this.meilisearchService.isEnabled()) {
return;
this.logger.warn('Meilisearch disabled; skipping semantic chunk indexing.');
return false;
}
const indexName = this.meilisearchService.buildSemanticChunkIndexName(tenantId);
let embedderReady = false;
if (openAiConfig?.apiKey) {
await this.meilisearchService.ensureOpenAiEmbedder(indexName, {
embedderReady = await this.meilisearchService.ensureOpenAiEmbedder(indexName, {
embedderName: this.semanticEmbedderName,
apiKey: openAiConfig.apiKey,
model: openAiConfig.embeddingModel || this.defaultEmbeddingModel,
documentTemplate: '{{doc.title}}\n{{doc.text}}',
});
this.logger.log(
`Meilisearch embedder ensured: index=${indexName} model=${openAiConfig.embeddingModel || this.defaultEmbeddingModel}`,
);
} else {
this.logger.warn('OpenAI embedder not configured; semantic search will be lexical only.');
}
this.logger.log(`Indexing semantic chunks: index=${indexName} count=${chunks.length}`);
await this.meilisearchService.upsertDocuments(indexName, chunks.map((chunk) => ({
id: `${projection.entityType}:${projection.entityId}:${chunk.chunkIndex}`,
id: `${projection.entityType}_${projection.entityId}_${chunk.chunkIndex}`,
entityType: projection.entityType,
entityId: projection.entityId,
title: projection.title,
@@ -173,6 +216,7 @@ export class SemanticOrchestratorService {
sourceRefId: chunk.sourceRefId,
text: chunk.text,
})));
return embedderReady;
}
private async generateSuggestions(
@@ -180,20 +224,32 @@ export class SemanticOrchestratorService {
projection: any,
chunks: any[],
openAiConfig: OpenAIConfig | null,
embedderReady: boolean,
userId?: string,
trigger: string = 'semantic_refresh',
) {
if (!this.meilisearchService.isEnabled() || !chunks.length) {
this.logger.warn(
`Skipping suggestion generation: meili=${this.meilisearchService.isEnabled()} chunks=${chunks.length}`,
);
return;
}
const indexName = this.meilisearchService.buildSemanticChunkIndexName(tenantId);
const queryText = chunks.slice(0, 3).map((chunk) => chunk.text).join(' ').slice(0, 1200);
this.logger.log(
`Generating suggestions: index=${indexName} queryLen=${queryText.length} hybrid=${embedderReady}`,
);
const search = await this.meilisearchService.searchIndex(
indexName,
queryText,
20,
openAiConfig?.apiKey ? { embedder: this.semanticEmbedderName } : undefined,
// semanticRatio:1.0 = pure vector search, no lexical component that would
// match on shared tokens like 'name:' or 'Comment 1:' across all records.
embedderReady ? { embedder: this.semanticEmbedderName, semanticRatio: 1.0 } : undefined,
);
this.logger.log(
`Meilisearch results: index=${indexName} hits=${search.hits?.length || 0} total=${search.total}`,
);
const grouped = new Map<string, any[]>();
@@ -201,6 +257,10 @@ export class SemanticOrchestratorService {
if (hit.entityType === projection.entityType && hit.entityId === projection.entityId) {
continue;
}
// Skip self-links where source and target resolve to the same entity
if (hit.entityId === projection.entityId) {
continue;
}
const key = `${hit.entityType}:${hit.entityId}`;
if (!grouped.has(key)) grouped.set(key, []);
grouped.get(key).push(hit);