WIP - some progress with semantic linking but still needs a lot of work

2026-04-11 23:30:25 +02:00
parent 12b0a0881e
commit 320f8c4266
5 changed files with 220 additions and 17 deletions
--- a/backend/src/knowledge/adapters/semantic-projection.adapter.ts
+++ b/backend/src/knowledge/adapters/semantic-projection.adapter.ts
@@ -10,6 +10,8 @@ export type SemanticProjection = {
  entityId: string;
  title: string;
  narrative: string;
+  /** Plain text used for embedding — no 'key: value' labels, no comments (chunker handles those separately). */
+  embeddingNarrative: string;
  metadata: Record<string, any>;
  sourceSummary: {
    includedFieldCount: number;
@@ -61,11 +63,17 @@ export class DefaultSemanticProjectionAdapter implements SemanticProjectionAdapt

    const narrative = [fieldNarrative, commentNarrative].filter(Boolean).join('\n\n');

+    // Plain values only — no 'key:' prefixes, no comments (chunker adds those separately).
+    const embeddingNarrative = fieldEntries
+      .map(([, value]) => String(value))
+      .join('\n');
+
    return {
      entityType: input.objectApiName,
      entityId: input.record.id,
      title,
      narrative,
+      embeddingNarrative,
      metadata: {
        objectApiName: input.objectApiName,
        hasComments: (input.comments || []).length > 0,
--- a/backend/src/knowledge/services/comment.service.ts
+++ b/backend/src/knowledge/services/comment.service.ts
@@ -33,6 +33,9 @@ export class CommentService {
      })
      .returning('*');

+    console.log(
+      `[Knowledge] Comment created: ${dto.parentObjectApiName}:${dto.parentRecordId} by ${userId}`,
+    );
    await this.semanticOrchestratorService.refreshRecord(
      tenantId,
      dto.parentObjectApiName,
@@ -63,6 +66,9 @@ export class CommentService {
        updated_at: knex.fn.now(),
      });

+    console.log(
+      `[Knowledge] Comment updated: ${existing.parent_object_api_name}:${existing.parent_record_id} by ${userId}`,
+    );
    await this.semanticOrchestratorService.refreshRecord(
      tenantId,
      existing.parent_object_api_name,
@@ -88,6 +94,9 @@ export class CommentService {

    await knex('comments').where({ id: commentId }).delete();

+    console.log(
+      `[Knowledge] Comment deleted: ${existing.parent_object_api_name}:${existing.parent_record_id} by ${userId}`,
+    );
    await this.semanticOrchestratorService.refreshRecord(
      tenantId,
      existing.parent_object_api_name,
--- a/backend/src/knowledge/services/semantic-link.service.ts
+++ b/backend/src/knowledge/services/semantic-link.service.ts
@@ -51,7 +51,10 @@ export class SemanticLinkService {
    );

    const payload = {
-      ...normalized,
+      source_entity_type: normalized.sourceEntityType,
+      source_entity_id: normalized.sourceEntityId,
+      target_entity_type: normalized.targetEntityType,
+      target_entity_id: normalized.targetEntityId,
      link_type: input.linkType || 'related_to',
      status: input.status || 'suggested',
      origin: input.origin || 'semantic',
--- a/backend/src/knowledge/services/semantic-orchestrator.service.ts
+++ b/backend/src/knowledge/services/semantic-orchestrator.service.ts
@@ -3,6 +3,7 @@ import { TenantDatabaseService } from '../../tenant/tenant-database.service';
 import { MeilisearchService } from '../../search/meilisearch.service';
 import { getCentralPrisma } from '../../prisma/central-prisma.service';
 import { OpenAIConfig } from '../../voice/interfaces/integration-config.interface';
+import { randomUUID } from 'crypto';
 import {
  DefaultSemanticProjectionAdapter,
  SemanticProjectionAdapter,
@@ -16,7 +17,7 @@ export class SemanticOrchestratorService {
  private readonly adapters: SemanticProjectionAdapter[] = [new DefaultSemanticProjectionAdapter()];
  private readonly defaultEmbeddingModel =
    process.env.OPENAI_EMBEDDING_MODEL || 'text-embedding-3-small';
-  private readonly semanticEmbedderName = 'semantic-openai';
+  private readonly semanticEmbedderName = 'default';

  constructor(
    private readonly tenantDbService: TenantDatabaseService,
@@ -32,6 +33,9 @@ export class SemanticOrchestratorService {
    userId?: string,
    trigger: string = 'manual',
  ) {
+    this.logger.log(
+      `Semantic refresh start: ${objectApiName}:${recordId} (trigger=${trigger})`,
+    );
    const resolvedTenantId = await this.tenantDbService.resolveTenantId(tenantId);
    const knex = await this.tenantDbService.getTenantKnexById(resolvedTenantId);

@@ -44,6 +48,7 @@ export class SemanticOrchestratorService {
    const tableName = this.getTableName(objectDefinition);
    const record = await knex(tableName).where({ id: recordId }).first();
    if (!record) {
+      this.logger.warn(`Record not found for semantic refresh: ${objectApiName}:${recordId}`);
      return { skipped: true };
    }

@@ -53,6 +58,9 @@ export class SemanticOrchestratorService {
        parent_record_id: recordId,
      })
      .orderBy('created_at', 'asc');
+    this.logger.log(
+      `Semantic refresh source: ${objectApiName}:${recordId} comments=${comments.length}`,
+    );

    const adapter = this.adapters.find((candidate) => candidate.supports(objectApiName))!;
    const projection = adapter.buildProjection({
@@ -63,13 +71,30 @@ export class SemanticOrchestratorService {
    });

    const documentId = await this.upsertSemanticDocument(knex, projection);
-    const chunks = this.chunkerService.chunkText(projection.narrative, comments);
+    // Use embeddingNarrative (plain values, no labels) so lexical noise from 'key:'
+    // prefixes doesn't inflate match scores. Comments are passed separately so they
+    // are not double-counted (narrative already embeds them with 'Comment N:' prefix).
+    const chunks = this.chunkerService.chunkText(projection.embeddingNarrative, comments);
+    this.logger.log(
+      `Semantic refresh chunking: ${objectApiName}:${recordId} chunks=${chunks.length}`,
+    );
    await this.replaceChunks(knex, documentId, chunks);

    const openAiConfig = await this.getOpenAiConfig(resolvedTenantId);
-    await this.indexChunks(resolvedTenantId, projection, chunks, openAiConfig);
-    await this.generateSuggestions(resolvedTenantId, projection, chunks, openAiConfig, userId, trigger);
+    const embedderReady = await this.indexChunks(resolvedTenantId, projection, chunks, openAiConfig);
+    await this.generateSuggestions(
+      resolvedTenantId,
+      projection,
+      chunks,
+      openAiConfig,
+      embedderReady,
+      userId,
+      trigger,
+    );

+    this.logger.log(
+      `Semantic refresh complete: ${objectApiName}:${recordId} document=${documentId}`,
+    );
    return { documentId, chunkCount: chunks.length };
  }

@@ -111,8 +136,10 @@ export class SemanticOrchestratorService {
      return existing.id;
    }

+    const newId = randomUUID();
    const [created] = await knex('semantic_documents')
      .insert({
+        id: newId,
        entity_type: projection.entityType,
        entity_id: projection.entityId,
        title: projection.title,
@@ -124,10 +151,18 @@ export class SemanticOrchestratorService {
      })
      .returning('id');

-    return typeof created === 'string' ? created : created.id;
+    if (created && typeof created === 'object' && created.id) {
+      return created.id;
+    }
+    // MySQL may return a numeric insert id (often 0 for UUID PKs). Always trust the generated UUID.
+    return newId;
  }

  private async replaceChunks(knex: any, documentId: string, chunks: any[]) {
+    if (!documentId) {
+      this.logger.warn('Skipping chunk replace: missing semantic document id.');
+      return;
+    }
    await knex('semantic_chunks').where({ semantic_document_id: documentId }).delete();
    if (!chunks.length) return;

@@ -152,20 +187,28 @@ export class SemanticOrchestratorService {
    openAiConfig: OpenAIConfig | null,
  ) {
    if (!this.meilisearchService.isEnabled()) {
-      return;
+      this.logger.warn('Meilisearch disabled; skipping semantic chunk indexing.');
+      return false;
    }

    const indexName = this.meilisearchService.buildSemanticChunkIndexName(tenantId);
+    let embedderReady = false;
    if (openAiConfig?.apiKey) {
-      await this.meilisearchService.ensureOpenAiEmbedder(indexName, {
+      embedderReady = await this.meilisearchService.ensureOpenAiEmbedder(indexName, {
        embedderName: this.semanticEmbedderName,
        apiKey: openAiConfig.apiKey,
        model: openAiConfig.embeddingModel || this.defaultEmbeddingModel,
        documentTemplate: '{{doc.title}}\n{{doc.text}}',
      });
+      this.logger.log(
+        `Meilisearch embedder ensured: index=${indexName} model=${openAiConfig.embeddingModel || this.defaultEmbeddingModel}`,
+      );
+    } else {
+      this.logger.warn('OpenAI embedder not configured; semantic search will be lexical only.');
    }
+    this.logger.log(`Indexing semantic chunks: index=${indexName} count=${chunks.length}`);
    await this.meilisearchService.upsertDocuments(indexName, chunks.map((chunk) => ({
-      id: `${projection.entityType}:${projection.entityId}:${chunk.chunkIndex}`,
+      id: `${projection.entityType}_${projection.entityId}_${chunk.chunkIndex}`,
      entityType: projection.entityType,
      entityId: projection.entityId,
      title: projection.title,
@@ -173,6 +216,7 @@ export class SemanticOrchestratorService {
      sourceRefId: chunk.sourceRefId,
      text: chunk.text,
    })));
+    return embedderReady;
  }

  private async generateSuggestions(
@@ -180,20 +224,32 @@ export class SemanticOrchestratorService {
    projection: any,
    chunks: any[],
    openAiConfig: OpenAIConfig | null,
+    embedderReady: boolean,
    userId?: string,
    trigger: string = 'semantic_refresh',
  ) {
    if (!this.meilisearchService.isEnabled() || !chunks.length) {
+      this.logger.warn(
+        `Skipping suggestion generation: meili=${this.meilisearchService.isEnabled()} chunks=${chunks.length}`,
+      );
      return;
    }

    const indexName = this.meilisearchService.buildSemanticChunkIndexName(tenantId);
    const queryText = chunks.slice(0, 3).map((chunk) => chunk.text).join(' ').slice(0, 1200);
+    this.logger.log(
+      `Generating suggestions: index=${indexName} queryLen=${queryText.length} hybrid=${embedderReady}`,
+    );
    const search = await this.meilisearchService.searchIndex(
      indexName,
      queryText,
      20,
-      openAiConfig?.apiKey ? { embedder: this.semanticEmbedderName } : undefined,
+      // semanticRatio:1.0 = pure vector search, no lexical component that would
+      // match on shared tokens like 'name:' or 'Comment 1:' across all records.
+      embedderReady ? { embedder: this.semanticEmbedderName, semanticRatio: 1.0 } : undefined,
+    );
+    this.logger.log(
+      `Meilisearch results: index=${indexName} hits=${search.hits?.length || 0} total=${search.total}`,
    );

    const grouped = new Map<string, any[]>();
@@ -201,6 +257,10 @@ export class SemanticOrchestratorService {
      if (hit.entityType === projection.entityType && hit.entityId === projection.entityId) {
        continue;
      }
+      // Skip self-links where source and target resolve to the same entity
+      if (hit.entityId === projection.entityId) {
+        continue;
+      }
      const key = `${hit.entityType}:${hit.entityId}`;
      if (!grouped.has(key)) grouped.set(key, []);
      grouped.get(key).push(hit);
--- a/backend/src/search/meilisearch.service.ts
+++ b/backend/src/search/meilisearch.service.ts
@@ -24,6 +24,7 @@ type OpenAiEmbedderConfig = {
 export class MeilisearchService {
  private readonly logger = new Logger(MeilisearchService.name);
  private readonly embedderCache = new Map<string, string>();
+  private vectorStoreEnabled = false;

  isEnabled(): boolean {
    return Boolean(this.getConfig());
@@ -186,6 +187,16 @@ export class MeilisearchService {
      const response = await this.requestJson('POST', url, documents, this.buildHeaders(config));
      if (!this.isSuccessStatus(response.status)) {
        this.logger.warn(`Meilisearch document upsert failed for index ${indexName}: ${response.status}`);
+        return;
+      }
+      // Meilisearch indexes (and embeds) documents asynchronously. Wait for the task
+      // to complete so callers can immediately search and see the new documents.
+      const taskUid = response.body?.taskUid ?? response.body?.uid;
+      if (Number.isFinite(Number(taskUid))) {
+        const succeeded = await this.waitForTask(config, Number(taskUid), 30000);
+        if (!succeeded) {
+          this.logger.warn(`Meilisearch indexing task did not succeed within timeout: taskUid=${taskUid} index=${indexName}`);
+        }
      }
    } catch (error) {
      this.logger.warn(`Meilisearch document upsert failed: ${error.message}`);
@@ -215,7 +226,33 @@ export class MeilisearchService {
      );

      if (!this.isSuccessStatus(response.status)) {
-        this.logger.warn(`Meilisearch search failed for index ${indexName}: ${response.status}`);
+        this.logger.warn(
+          `Meilisearch search failed for index ${indexName}: ${response.status}`,
+        );
+        this.logger.warn(
+          `Meilisearch search payload: ${JSON.stringify({ q: query, limit, hybrid })}`,
+        );
+        this.logger.warn(
+          `Meilisearch search error body: ${JSON.stringify(response.body)}`,
+        );
+        // If hybrid is invalid (embedder missing), retry once without hybrid
+        if (hybrid && response.body?.code === 'invalid_embedder') {
+          const fallback = await this.requestJson(
+            'POST',
+            url,
+            { q: query, limit },
+            this.buildHeaders(config),
+          );
+          if (this.isSuccessStatus(fallback.status)) {
+            const hits = Array.isArray(fallback.body?.hits) ? fallback.body.hits : [];
+            const total =
+              fallback.body?.estimatedTotalHits ?? fallback.body?.nbHits ?? hits.length;
+            this.logger.warn(
+              `Meilisearch hybrid failed; fell back to lexical search for index ${indexName}.`,
+            );
+            return { hits, total };
+          }
+        }
        return { hits: [], total: 0 };
      }

@@ -268,7 +305,7 @@ export class MeilisearchService {
  }

  private requestJson(
-    method: 'POST' | 'DELETE' | 'PATCH',
+    method: 'POST' | 'DELETE' | 'PATCH' | 'GET',
    url: string,
    payload: any,
    headers: Record<string, string>,
@@ -305,19 +342,49 @@ export class MeilisearchService {
      );

      request.on('error', reject);
-      if (payload !== undefined) {
+      if (payload !== undefined && method !== 'GET') {
        request.write(JSON.stringify(payload));
      }
      request.end();
    });
  }

+  /**
+   * Turns on the Meilisearch `vectorStore` experimental feature by sending
+   * `PATCH {host}/experimental-features` with `{ vectorStore: true }`.
+   *
+   * The call is opt-in: nothing is sent unless the `MEILI_ENABLE_VECTOR_STORE`
+   * environment variable is exactly `'true'`. With the variable unset this
+   * method is a no-op, which matches the previous hard-disabled behaviour.
+   *
+   * After one successful response `vectorStoreEnabled` is set and later calls
+   * return immediately. Failures (non-success status or a thrown request
+   * error) are logged as warnings and never propagated to the caller.
+   */
+  private async enableVectorStore(): Promise<void> {
+    // Explicit switch instead of an unconditional `return;` followed by
+    // unreachable code that needed a lint suppression to survive.
+    if (process.env.MEILI_ENABLE_VECTOR_STORE !== 'true') return;
+    if (this.vectorStoreEnabled) return;
+    const meiliConfig = this.getConfig();
+    if (!meiliConfig) return;
+    const url = `${meiliConfig.host}/experimental-features`;
+    try {
+      const response = await this.requestJson(
+        'PATCH',
+        url,
+        { vectorStore: true },
+        this.buildHeaders(meiliConfig),
+      );
+      if (!this.isSuccessStatus(response.status)) {
+        this.logger.warn(
+          `Failed to enable Meilisearch vector store: ${response.status} ${JSON.stringify(response.body)}`,
+        );
+        return;
+      }
+      this.vectorStoreEnabled = true;
+      this.logger.log('Meilisearch vector store experimental feature enabled');
+    } catch (error) {
+      // A thrown value is not guaranteed to be an Error; avoid logging "undefined".
+      const reason = error instanceof Error ? error.message : String(error);
+      this.logger.warn(`Failed to enable Meilisearch vector store: ${reason}`);
+    }
+  }
+
  async ensureOpenAiEmbedder(
    indexName: string,
    config: OpenAiEmbedderConfig,
-  ): Promise<void> {
+  ): Promise<boolean> {
    const meiliConfig = this.getConfig();
-    if (!meiliConfig || !config?.apiKey) return;
+    if (!meiliConfig || !config?.apiKey) return false;
+
+    await this.enableVectorStore();

    const signature = JSON.stringify({
      embedderName: config.embedderName,
@@ -327,7 +394,7 @@ export class MeilisearchService {
    });
    const cacheKey = `${indexName}:${config.embedderName}`;
    if (this.embedderCache.get(cacheKey) === signature) {
-      return;
+      return true;
    }

    const url = `${meiliConfig.host}/indexes/${encodeURIComponent(indexName)}/settings/embedders`;
@@ -349,11 +416,67 @@ export class MeilisearchService {
        this.logger.warn(
          `Meilisearch embedder update failed for index ${indexName}: ${response.status}`,
        );
-        return;
+        this.logger.warn(
+          `Meilisearch embedder error body: ${JSON.stringify(response.body)}`,
+        );
+        return false;
+      }
+      const taskUid = response.body?.taskUid ?? response.body?.uid;
+      if (Number.isFinite(Number(taskUid))) {
+        const succeeded = await this.waitForTask(meiliConfig, Number(taskUid), 8000);
+        if (!succeeded) {
+          this.logger.warn(`Meilisearch embedder task did not succeed: ${taskUid}`);
+          return false;
+        }
+      }
+
+      const hasEmbedder = await this.hasEmbedder(meiliConfig, indexName, config.embedderName);
+      if (!hasEmbedder) {
+        this.logger.warn(`Meilisearch embedder missing after update: ${config.embedderName}`);
+        return false;
      }
      this.embedderCache.set(cacheKey, signature);
+      return true;
    } catch (error) {
      this.logger.warn(`Meilisearch embedder update failed: ${error.message}`);
+      return false;
    }
  }
+
+  /**
+   * Polls `GET {host}/tasks/{taskUid}` until the task reaches a terminal
+   * status or the time budget runs out.
+   *
+   * @param config    Meilisearch connection settings (host and headers source).
+   * @param taskUid   Identifier of the task to watch.
+   * @param timeoutMs Polling budget in milliseconds (default 8000).
+   * @returns `true` only when the task status is `'succeeded'`. `false` when
+   *   the status is `'failed'` or `'canceled'` (the task error is logged),
+   *   when a poll returns a non-success HTTP status, or when the budget
+   *   elapses before a terminal status is seen.
+   */
+  private async waitForTask(
+    config: MeiliConfig,
+    taskUid: number,
+    timeoutMs = 8000,
+  ): Promise<boolean> {
+    const taskUrl = `${config.host}/tasks/${taskUid}`;
+    const deadline = Date.now() + timeoutMs;
+
+    while (Date.now() < deadline) {
+      const poll = await this.requestJson('GET', taskUrl, undefined, this.buildHeaders(config));
+      if (!this.isSuccessStatus(poll.status)) {
+        return false;
+      }
+
+      switch (poll.body?.status) {
+        case 'succeeded':
+          return true;
+        case 'failed':
+        case 'canceled':
+          this.logger.warn(`Meilisearch task ${taskUid} failed: ${JSON.stringify(poll.body?.error)}`);
+          return false;
+        default:
+          // Any other status: pause 300 ms and poll again.
+          await new Promise((wake) => setTimeout(wake, 300));
+      }
+    }
+
+    // Budget exhausted without reaching a terminal status.
+    return false;
+  }
+
+  /**
+   * Checks whether an embedder with the given name is present in the index's
+   * embedder settings, read from
+   * `GET {host}/indexes/{indexName}/settings/embedders`.
+   *
+   * @param config       Meilisearch connection settings.
+   * @param indexName    Index whose settings are inspected (URL-encoded here).
+   * @param embedderName Key looked up in the returned settings object.
+   * @returns `true` when the response is successful and its body holds a
+   *   truthy entry under `embedderName`; `false` otherwise.
+   */
+  private async hasEmbedder(
+    config: MeiliConfig,
+    indexName: string,
+    embedderName: string,
+  ): Promise<boolean> {
+    const settingsUrl = `${config.host}/indexes/${encodeURIComponent(indexName)}/settings/embedders`;
+    const reply = await this.requestJson('GET', settingsUrl, undefined, this.buildHeaders(config));
+    if (this.isSuccessStatus(reply.status)) {
+      // An empty body is treated as "no embedders configured".
+      const configured = reply.body || {};
+      return Boolean(configured[embedderName]);
+    }
+    return false;
+  }
 }