WIP - call connecting to softphone
This commit is contained in:
@@ -3,13 +3,15 @@ import {
|
||||
FastifyAdapter,
|
||||
NestFastifyApplication,
|
||||
} from '@nestjs/platform-fastify';
|
||||
import { ValidationPipe } from '@nestjs/common';
|
||||
import { ValidationPipe, Logger } from '@nestjs/common';
|
||||
import { AppModule } from './app.module';
|
||||
import { VoiceService } from './voice/voice.service';
|
||||
import { AudioConverterService } from './voice/audio-converter.service';
|
||||
|
||||
async function bootstrap() {
|
||||
const app = await NestFactory.create<NestFastifyApplication>(
|
||||
AppModule,
|
||||
new FastifyAdapter({ logger: false }),
|
||||
new FastifyAdapter({ logger: true }),
|
||||
);
|
||||
|
||||
// Global validation pipe
|
||||
@@ -33,6 +35,144 @@ async function bootstrap() {
|
||||
const port = process.env.PORT || 3000;
|
||||
await app.listen(port, '0.0.0.0');
|
||||
|
||||
// After app is listening, register WebSocket handler
|
||||
const fastifyInstance = app.getHttpAdapter().getInstance();
|
||||
const logger = new Logger('MediaStreamWS');
|
||||
const voiceService = app.get(VoiceService);
|
||||
const audioConverter = app.get(AudioConverterService);
|
||||
|
||||
const WebSocketServer = require('ws').Server;
|
||||
const wss = new WebSocketServer({ noServer: true });
|
||||
|
||||
// Handle WebSocket upgrades at the server level
|
||||
const server = (fastifyInstance.server as any);
|
||||
|
||||
// Track active Media Streams connections: streamSid -> WebSocket
|
||||
const mediaStreams: Map<string, any> = new Map();
|
||||
|
||||
server.on('upgrade', (request: any, socket: any, head: any) => {
|
||||
if (request.url === '/api/voice/media-stream') {
|
||||
logger.log('=== MEDIA STREAM WEBSOCKET UPGRADE REQUEST ===');
|
||||
logger.log(`Path: ${request.url}`);
|
||||
|
||||
wss.handleUpgrade(request, socket, head, (ws: any) => {
|
||||
logger.log('=== MEDIA STREAM WEBSOCKET UPGRADED SUCCESSFULLY ===');
|
||||
handleMediaStreamSocket(ws);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
async function handleMediaStreamSocket(ws: any) {
|
||||
let streamSid: string | null = null;
|
||||
let callSid: string | null = null;
|
||||
let tenantDomain: string | null = null;
|
||||
let mediaPacketCount = 0;
|
||||
|
||||
ws.on('message', async (message: Buffer) => {
|
||||
try {
|
||||
const msg = JSON.parse(message.toString());
|
||||
|
||||
switch (msg.event) {
|
||||
case 'connected':
|
||||
logger.log('=== MEDIA STREAM EVENT: CONNECTED ===');
|
||||
logger.log(`Protocol: ${msg.protocol}`);
|
||||
logger.log(`Version: ${msg.version}`);
|
||||
break;
|
||||
|
||||
case 'start':
|
||||
streamSid = msg.streamSid;
|
||||
callSid = msg.start.callSid;
|
||||
tenantDomain = msg.start.customParameters?.tenantId || 'tenant1';
|
||||
|
||||
logger.log(`=== MEDIA STREAM EVENT: START ===`);
|
||||
logger.log(`StreamSid: ${streamSid}`);
|
||||
logger.log(`CallSid: ${callSid}`);
|
||||
logger.log(`Tenant: ${tenantDomain}`);
|
||||
logger.log(`MediaFormat: ${JSON.stringify(msg.start.mediaFormat)}`);
|
||||
|
||||
mediaStreams.set(streamSid, ws);
|
||||
logger.log(`Stored WebSocket for streamSid: ${streamSid}. Total active streams: ${mediaStreams.size}`);
|
||||
|
||||
// Initialize OpenAI Realtime connection
|
||||
logger.log(`Initializing OpenAI Realtime for call ${callSid}...`);
|
||||
try {
|
||||
await voiceService.initializeOpenAIRealtime({
|
||||
callSid,
|
||||
tenantId: tenantDomain,
|
||||
userId: msg.start.customParameters?.userId || 'system',
|
||||
});
|
||||
logger.log(`✓ OpenAI Realtime initialized for call ${callSid}`);
|
||||
} catch (error: any) {
|
||||
logger.error(`Failed to initialize OpenAI: ${error.message}`);
|
||||
}
|
||||
break;
|
||||
|
||||
case 'media':
|
||||
mediaPacketCount++;
|
||||
if (mediaPacketCount % 50 === 0) {
|
||||
logger.log(`Received media packet #${mediaPacketCount} for StreamSid: ${streamSid}`);
|
||||
}
|
||||
|
||||
if (!callSid || !tenantDomain) {
|
||||
logger.warn('Received media before start event');
|
||||
break;
|
||||
}
|
||||
|
||||
try {
|
||||
// Convert Twilio audio (μ-law 8kHz) to OpenAI format (PCM16 24kHz)
|
||||
const twilioAudio = msg.media.payload;
|
||||
const openaiAudio = audioConverter.twilioToOpenAI(twilioAudio);
|
||||
|
||||
// Send audio to OpenAI Realtime API
|
||||
await voiceService.sendAudioToOpenAI(callSid, openaiAudio);
|
||||
} catch (error: any) {
|
||||
logger.error(`Error processing media: ${error.message}`);
|
||||
}
|
||||
break;
|
||||
|
||||
case 'stop':
|
||||
logger.log(`=== MEDIA STREAM EVENT: STOP ===`);
|
||||
logger.log(`StreamSid: ${streamSid}`);
|
||||
logger.log(`Total media packets received: ${mediaPacketCount}`);
|
||||
|
||||
if (streamSid) {
|
||||
mediaStreams.delete(streamSid);
|
||||
logger.log(`Removed WebSocket for streamSid: ${streamSid}`);
|
||||
}
|
||||
|
||||
// Clean up OpenAI connection
|
||||
if (callSid) {
|
||||
try {
|
||||
logger.log(`Cleaning up OpenAI connection for call ${callSid}...`);
|
||||
await voiceService.cleanupOpenAIConnection(callSid);
|
||||
logger.log(`✓ OpenAI connection cleaned up`);
|
||||
} catch (error: any) {
|
||||
logger.error(`Failed to cleanup OpenAI: ${error.message}`);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
logger.debug(`Unknown media stream event: ${msg.event}`);
|
||||
}
|
||||
} catch (error: any) {
|
||||
logger.error(`Error processing media stream message: ${error.message}`);
|
||||
}
|
||||
});
|
||||
|
||||
ws.on('close', () => {
|
||||
logger.log(`=== MEDIA STREAM WEBSOCKET CLOSED ===`);
|
||||
if (streamSid) {
|
||||
mediaStreams.delete(streamSid);
|
||||
}
|
||||
});
|
||||
|
||||
ws.on('error', (error: Error) => {
|
||||
logger.error(`=== MEDIA STREAM WEBSOCKET ERROR ===`);
|
||||
logger.error(`Error message: ${error.message}`);
|
||||
});
|
||||
}
|
||||
|
||||
console.log(`🚀 Application is running on: http://localhost:${port}/api`);
|
||||
}
|
||||
|
||||
|
||||
214
backend/src/voice/audio-converter.service.ts
Normal file
214
backend/src/voice/audio-converter.service.ts
Normal file
@@ -0,0 +1,214 @@
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
|
||||
/**
|
||||
* Audio format converter for Twilio <-> OpenAI audio streaming
|
||||
*
|
||||
* Twilio Media Streams format:
|
||||
* - Codec: μ-law (G.711)
|
||||
* - Sample rate: 8kHz
|
||||
* - Encoding: base64
|
||||
* - Chunk size: 20ms (160 bytes)
|
||||
*
|
||||
* OpenAI Realtime API format:
|
||||
* - Codec: PCM16
|
||||
* - Sample rate: 24kHz
|
||||
* - Encoding: base64
|
||||
* - Mono channel
|
||||
*/
|
||||
@Injectable()
|
||||
export class AudioConverterService {
|
||||
private readonly logger = new Logger(AudioConverterService.name);
|
||||
|
||||
// μ-law decode lookup table
|
||||
private readonly MULAW_DECODE_TABLE = this.buildMuLawDecodeTable();
|
||||
|
||||
// μ-law encode lookup table
|
||||
private readonly MULAW_ENCODE_TABLE = this.buildMuLawEncodeTable();
|
||||
|
||||
/**
|
||||
* Build μ-law to linear PCM16 decode table
|
||||
*/
|
||||
private buildMuLawDecodeTable(): Int16Array {
|
||||
const table = new Int16Array(256);
|
||||
for (let i = 0; i < 256; i++) {
|
||||
const mulaw = ~i;
|
||||
const exponent = (mulaw >> 4) & 0x07;
|
||||
const mantissa = mulaw & 0x0f;
|
||||
let sample = (mantissa << 3) + 0x84;
|
||||
sample <<= exponent;
|
||||
sample -= 0x84;
|
||||
if ((mulaw & 0x80) === 0) {
|
||||
sample = -sample;
|
||||
}
|
||||
table[i] = sample;
|
||||
}
|
||||
return table;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build linear PCM16 to μ-law encode table
|
||||
*/
|
||||
private buildMuLawEncodeTable(): Uint8Array {
|
||||
const table = new Uint8Array(65536);
|
||||
for (let i = 0; i < 65536; i++) {
|
||||
const sample = (i - 32768);
|
||||
const sign = sample < 0 ? 0x80 : 0x00;
|
||||
const magnitude = Math.abs(sample);
|
||||
|
||||
// Add bias
|
||||
let biased = magnitude + 0x84;
|
||||
|
||||
// Find exponent
|
||||
let exponent = 7;
|
||||
for (let exp = 0; exp < 8; exp++) {
|
||||
if (biased <= (0xff << exp)) {
|
||||
exponent = exp;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Extract mantissa
|
||||
const mantissa = (biased >> (exponent + 3)) & 0x0f;
|
||||
|
||||
// Combine sign, exponent, mantissa
|
||||
const mulaw = ~(sign | (exponent << 4) | mantissa);
|
||||
table[i] = mulaw & 0xff;
|
||||
}
|
||||
return table;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode μ-law audio to linear PCM16
|
||||
* @param mulawData - Buffer containing μ-law encoded audio
|
||||
* @returns Buffer containing PCM16 audio (16-bit little-endian)
|
||||
*/
|
||||
decodeMuLaw(mulawData: Buffer): Buffer {
|
||||
const pcm16 = Buffer.allocUnsafe(mulawData.length * 2);
|
||||
|
||||
for (let i = 0; i < mulawData.length; i++) {
|
||||
const sample = this.MULAW_DECODE_TABLE[mulawData[i]];
|
||||
pcm16.writeInt16LE(sample, i * 2);
|
||||
}
|
||||
|
||||
return pcm16;
|
||||
}
|
||||
|
||||
/**
|
||||
* Encode linear PCM16 to μ-law
|
||||
* @param pcm16Data - Buffer containing PCM16 audio (16-bit little-endian)
|
||||
* @returns Buffer containing μ-law encoded audio
|
||||
*/
|
||||
encodeMuLaw(pcm16Data: Buffer): Buffer {
|
||||
const mulaw = Buffer.allocUnsafe(pcm16Data.length / 2);
|
||||
|
||||
for (let i = 0; i < pcm16Data.length; i += 2) {
|
||||
const sample = pcm16Data.readInt16LE(i);
|
||||
const index = (sample + 32768) & 0xffff;
|
||||
mulaw[i / 2] = this.MULAW_ENCODE_TABLE[index];
|
||||
}
|
||||
|
||||
return mulaw;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resample audio from 8kHz to 24kHz (linear interpolation)
|
||||
* @param pcm16Data - Buffer containing 8kHz PCM16 audio
|
||||
* @returns Buffer containing 24kHz PCM16 audio
|
||||
*/
|
||||
resample8kTo24k(pcm16Data: Buffer): Buffer {
|
||||
const inputSamples = pcm16Data.length / 2;
|
||||
const outputSamples = Math.floor(inputSamples * 3); // 8k * 3 = 24k
|
||||
const output = Buffer.allocUnsafe(outputSamples * 2);
|
||||
|
||||
for (let i = 0; i < outputSamples; i++) {
|
||||
const srcIndex = i / 3;
|
||||
const srcIndexFloor = Math.floor(srcIndex);
|
||||
const srcIndexCeil = Math.min(srcIndexFloor + 1, inputSamples - 1);
|
||||
const fraction = srcIndex - srcIndexFloor;
|
||||
|
||||
const sample1 = pcm16Data.readInt16LE(srcIndexFloor * 2);
|
||||
const sample2 = pcm16Data.readInt16LE(srcIndexCeil * 2);
|
||||
|
||||
// Linear interpolation
|
||||
const interpolated = Math.round(sample1 + (sample2 - sample1) * fraction);
|
||||
output.writeInt16LE(interpolated, i * 2);
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resample audio from 24kHz to 8kHz (decimation with averaging)
|
||||
* @param pcm16Data - Buffer containing 24kHz PCM16 audio
|
||||
* @returns Buffer containing 8kHz PCM16 audio
|
||||
*/
|
||||
resample24kTo8k(pcm16Data: Buffer): Buffer {
|
||||
const inputSamples = pcm16Data.length / 2;
|
||||
const outputSamples = Math.floor(inputSamples / 3); // 24k / 3 = 8k
|
||||
const output = Buffer.allocUnsafe(outputSamples * 2);
|
||||
|
||||
for (let i = 0; i < outputSamples; i++) {
|
||||
// Average 3 samples for anti-aliasing
|
||||
const idx1 = Math.min(i * 3, inputSamples - 1);
|
||||
const idx2 = Math.min(i * 3 + 1, inputSamples - 1);
|
||||
const idx3 = Math.min(i * 3 + 2, inputSamples - 1);
|
||||
|
||||
const sample1 = pcm16Data.readInt16LE(idx1 * 2);
|
||||
const sample2 = pcm16Data.readInt16LE(idx2 * 2);
|
||||
const sample3 = pcm16Data.readInt16LE(idx3 * 2);
|
||||
|
||||
const averaged = Math.round((sample1 + sample2 + sample3) / 3);
|
||||
output.writeInt16LE(averaged, i * 2);
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert Twilio μ-law 8kHz to OpenAI PCM16 24kHz
|
||||
* @param twilioBase64 - Base64-encoded μ-law audio from Twilio
|
||||
* @returns Base64-encoded PCM16 24kHz audio for OpenAI
|
||||
*/
|
||||
twilioToOpenAI(twilioBase64: string): string {
|
||||
try {
|
||||
// Decode base64
|
||||
const mulawBuffer = Buffer.from(twilioBase64, 'base64');
|
||||
|
||||
// μ-law -> PCM16
|
||||
const pcm16_8k = this.decodeMuLaw(mulawBuffer);
|
||||
|
||||
// 8kHz -> 24kHz
|
||||
const pcm16_24k = this.resample8kTo24k(pcm16_8k);
|
||||
|
||||
// Encode to base64
|
||||
return pcm16_24k.toString('base64');
|
||||
} catch (error) {
|
||||
this.logger.error('Error converting Twilio to OpenAI audio', error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert OpenAI PCM16 24kHz to Twilio μ-law 8kHz
|
||||
* @param openaiBase64 - Base64-encoded PCM16 24kHz audio from OpenAI
|
||||
* @returns Base64-encoded μ-law 8kHz audio for Twilio
|
||||
*/
|
||||
openAIToTwilio(openaiBase64: string): string {
|
||||
try {
|
||||
// Decode base64
|
||||
const pcm16_24k = Buffer.from(openaiBase64, 'base64');
|
||||
|
||||
// 24kHz -> 8kHz
|
||||
const pcm16_8k = this.resample24kTo8k(pcm16_24k);
|
||||
|
||||
// PCM16 -> μ-law
|
||||
const mulawBuffer = this.encodeMuLaw(pcm16_8k);
|
||||
|
||||
// Encode to base64
|
||||
return mulawBuffer.toString('base64');
|
||||
} catch (error) {
|
||||
this.logger.error('Error converting OpenAI to Twilio audio', error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -13,6 +13,7 @@ import { FastifyRequest, FastifyReply } from 'fastify';
|
||||
import { JwtAuthGuard } from '../auth/jwt-auth.guard';
|
||||
import { VoiceService } from './voice.service';
|
||||
import { VoiceGateway } from './voice.gateway';
|
||||
import { AudioConverterService } from './audio-converter.service';
|
||||
import { InitiateCallDto } from './dto/initiate-call.dto';
|
||||
import { TenantId } from '../tenant/tenant.decorator';
|
||||
|
||||
@@ -20,9 +21,13 @@ import { TenantId } from '../tenant/tenant.decorator';
|
||||
export class VoiceController {
|
||||
private readonly logger = new Logger(VoiceController.name);
|
||||
|
||||
// Track active Media Streams connections: streamSid -> WebSocket
|
||||
private mediaStreams: Map<string, any> = new Map();
|
||||
|
||||
constructor(
|
||||
private readonly voiceService: VoiceService,
|
||||
private readonly voiceGateway: VoiceGateway,
|
||||
private readonly audioConverter: AudioConverterService,
|
||||
) {}
|
||||
|
||||
/**
|
||||
@@ -159,8 +164,12 @@ export class VoiceController {
|
||||
const fromNumber = body.From;
|
||||
const toNumber = body.To;
|
||||
|
||||
this.logger.log(`=== INBOUND CALL RECEIVED ===`);
|
||||
this.logger.log(`CallSid: ${callSid}, From: ${fromNumber}, To: ${toNumber}`);
|
||||
this.logger.log(`\n\n╔════════════════════════════════════════╗`);
|
||||
this.logger.log(`║ === INBOUND CALL RECEIVED ===`);
|
||||
this.logger.log(`╚════════════════════════════════════════╝`);
|
||||
this.logger.log(`CallSid: ${callSid}`);
|
||||
this.logger.log(`From: ${fromNumber}`);
|
||||
this.logger.log(`To: ${toNumber}`);
|
||||
this.logger.log(`Full body: ${JSON.stringify(body)}`);
|
||||
|
||||
try {
|
||||
@@ -174,6 +183,9 @@ export class VoiceController {
|
||||
const connectedUsers = this.voiceGateway.getConnectedUsers(tenantDomain);
|
||||
|
||||
this.logger.log(`Connected users for tenant ${tenantDomain}: ${connectedUsers.length}`);
|
||||
if (connectedUsers.length > 0) {
|
||||
this.logger.log(`Connected user IDs: ${connectedUsers.join(', ')}`);
|
||||
}
|
||||
|
||||
if (connectedUsers.length === 0) {
|
||||
// No users online - send to voicemail or play message
|
||||
@@ -182,22 +194,52 @@ export class VoiceController {
|
||||
<Say>Sorry, no agents are currently available. Please try again later.</Say>
|
||||
<Hangup/>
|
||||
</Response>`;
|
||||
this.logger.log(`No users online - returning unavailable message`);
|
||||
this.logger.log(`❌ No users online - returning unavailable message`);
|
||||
return res.type('text/xml').send(twiml);
|
||||
}
|
||||
|
||||
// Build TwiML to dial all connected clients (first to answer gets the call)
|
||||
// Build TwiML to dial all connected clients with Media Streams for AI
|
||||
const clientElements = connectedUsers.map(userId => ` <Client>${userId}</Client>`).join('\n');
|
||||
|
||||
// Use wss:// for secure WebSocket (Traefik handles HTTPS)
|
||||
const streamUrl = `wss://${host}/api/voice/media-stream`;
|
||||
|
||||
this.logger.log(`Stream URL: ${streamUrl}`);
|
||||
this.logger.log(`Dialing ${connectedUsers.length} client(s)...`);
|
||||
this.logger.log(`Client IDs to dial: ${connectedUsers.join(', ')}`);
|
||||
|
||||
// Verify we have client IDs in proper format
|
||||
if (connectedUsers.length > 0) {
|
||||
this.logger.log(`First Client ID format check: "${connectedUsers[0]}" (length: ${connectedUsers[0].length})`);
|
||||
}
|
||||
|
||||
// Notify connected users about incoming call via Socket.IO
|
||||
connectedUsers.forEach(userId => {
|
||||
this.voiceGateway.notifyIncomingCall(userId, {
|
||||
callSid,
|
||||
fromNumber,
|
||||
toNumber,
|
||||
tenantDomain,
|
||||
});
|
||||
});
|
||||
|
||||
const twiml = `<?xml version="1.0" encoding="UTF-8"?>
|
||||
<Response>
|
||||
<Say>Connecting your call</Say>
|
||||
<Dial timeout="30" action="${host}/api/voice/webhook/dial-status">
|
||||
<Dial timeout="30">
|
||||
${clientElements}
|
||||
<OnAnswer>
|
||||
<Connect>
|
||||
<Stream url="${streamUrl}">
|
||||
<Parameter name="tenantId" value="${tenantDomain}"/>
|
||||
<Parameter name="userId" value="${connectedUsers[0]}"/>
|
||||
</Stream>
|
||||
</Connect>
|
||||
</OnAnswer>
|
||||
</Dial>
|
||||
</Response>`;
|
||||
|
||||
this.logger.log(`Returning inbound TwiML - dialing ${connectedUsers.length} client(s)`);
|
||||
this.logger.log(`✓ Returning inbound TwiML with Media Streams - dialing ${connectedUsers.length} client(s)`);
|
||||
this.logger.log(`Generated TwiML:\n${twiml}\n`);
|
||||
res.type('text/xml').send(twiml);
|
||||
} catch (error: any) {
|
||||
this.logger.error(`Error generating inbound TwiML: ${error.message}`);
|
||||
@@ -240,4 +282,216 @@ ${clientElements}
|
||||
|
||||
return { success: true };
|
||||
}
|
||||
|
||||
/**
|
||||
* Twilio Media Streams WebSocket endpoint
|
||||
* Receives real-time audio from Twilio and forwards to OpenAI Realtime API
|
||||
*
|
||||
* This handles the HTTP GET request and upgrades it to WebSocket manually.
|
||||
*/
|
||||
@Get('media-stream')
|
||||
mediaStream(@Req() req: FastifyRequest) {
|
||||
// For WebSocket upgrade, we need to access the raw socket
|
||||
let socket: any;
|
||||
|
||||
try {
|
||||
this.logger.log(`=== MEDIA STREAM REQUEST ===`);
|
||||
this.logger.log(`URL: ${req.url}`);
|
||||
this.logger.log(`Headers keys: ${Object.keys(req.headers).join(', ')}`);
|
||||
this.logger.log(`Headers: ${JSON.stringify(req.headers)}`);
|
||||
|
||||
// Check if this is a WebSocket upgrade request
|
||||
const hasWebSocketKey = 'sec-websocket-key' in req.headers;
|
||||
const hasWebSocketVersion = 'sec-websocket-version' in req.headers;
|
||||
|
||||
this.logger.log(`hasWebSocketKey: ${hasWebSocketKey}`);
|
||||
this.logger.log(`hasWebSocketVersion: ${hasWebSocketVersion}`);
|
||||
|
||||
if (!hasWebSocketKey || !hasWebSocketVersion) {
|
||||
this.logger.log('Not a WebSocket upgrade request - returning');
|
||||
return;
|
||||
}
|
||||
|
||||
this.logger.log('✓ WebSocket upgrade detected');
|
||||
|
||||
// Get the socket - try different ways
|
||||
socket = (req.raw as any).socket;
|
||||
this.logger.log(`Socket obtained: ${!!socket}`);
|
||||
|
||||
if (!socket) {
|
||||
this.logger.error('Failed to get socket from req.raw');
|
||||
return;
|
||||
}
|
||||
|
||||
const rawRequest = req.raw;
|
||||
const head = Buffer.alloc(0);
|
||||
|
||||
this.logger.log('Creating WebSocketServer...');
|
||||
const WebSocketServer = require('ws').Server;
|
||||
const wss = new WebSocketServer({ noServer: true });
|
||||
|
||||
this.logger.log('Calling handleUpgrade...');
|
||||
|
||||
// handleUpgrade will send the 101 response and take over the socket
|
||||
wss.handleUpgrade(rawRequest, socket, head, (ws: any) => {
|
||||
this.logger.log('=== TWILIO MEDIA STREAM WEBSOCKET UPGRADED SUCCESSFULLY ===');
|
||||
this.handleMediaStreamSocket(ws);
|
||||
});
|
||||
|
||||
this.logger.log('handleUpgrade completed');
|
||||
} catch (error: any) {
|
||||
this.logger.error(`=== FAILED TO UPGRADE TO WEBSOCKET ===`);
|
||||
this.logger.error(`Error message: ${error.message}`);
|
||||
this.logger.error(`Error stack: ${error.stack}`);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle incoming Media Stream WebSocket messages
|
||||
*/
|
||||
private handleMediaStreamSocket(ws: any) {
|
||||
let streamSid: string | null = null;
|
||||
let callSid: string | null = null;
|
||||
let tenantDomain: string | null = null;
|
||||
let mediaPacketCount = 0;
|
||||
|
||||
// WebSocket message handler
|
||||
ws.on('message', async (message: Buffer) => {
|
||||
try {
|
||||
const msg = JSON.parse(message.toString());
|
||||
|
||||
switch (msg.event) {
|
||||
case 'connected':
|
||||
this.logger.log('=== MEDIA STREAM EVENT: CONNECTED ===');
|
||||
this.logger.log(`Protocol: ${msg.protocol}`);
|
||||
this.logger.log(`Version: ${msg.version}`);
|
||||
break;
|
||||
|
||||
case 'start':
|
||||
streamSid = msg.streamSid;
|
||||
callSid = msg.start.callSid;
|
||||
|
||||
// Extract tenant from customParameters if available
|
||||
tenantDomain = msg.start.customParameters?.tenantId || 'tenant1';
|
||||
|
||||
this.logger.log(`=== MEDIA STREAM EVENT: START ===`);
|
||||
this.logger.log(`StreamSid: ${streamSid}`);
|
||||
this.logger.log(`CallSid: ${callSid}`);
|
||||
this.logger.log(`Tenant: ${tenantDomain}`);
|
||||
this.logger.log(`AccountSid: ${msg.start.accountSid}`);
|
||||
this.logger.log(`MediaFormat: ${JSON.stringify(msg.start.mediaFormat)}`);
|
||||
this.logger.log(`Custom Parameters: ${JSON.stringify(msg.start.customParameters)}`);
|
||||
|
||||
// Store WebSocket connection
|
||||
this.mediaStreams.set(streamSid, ws);
|
||||
this.logger.log(`Stored WebSocket for streamSid: ${streamSid}. Total active streams: ${this.mediaStreams.size}`);
|
||||
|
||||
// Initialize OpenAI Realtime connection for this call
|
||||
this.logger.log(`Initializing OpenAI Realtime for call ${callSid}...`);
|
||||
await this.voiceService.initializeOpenAIRealtime({
|
||||
callSid,
|
||||
tenantId: tenantDomain,
|
||||
userId: msg.start.customParameters?.userId || 'system',
|
||||
});
|
||||
|
||||
this.logger.log(`✓ OpenAI Realtime initialized for call ${callSid}`);
|
||||
break;
|
||||
|
||||
case 'media':
|
||||
mediaPacketCount++;
|
||||
if (mediaPacketCount % 50 === 0) {
|
||||
// Log every 50th packet to avoid spam
|
||||
this.logger.log(`Received media packet #${mediaPacketCount} for StreamSid: ${streamSid}, CallSid: ${callSid}, PayloadSize: ${msg.media.payload?.length || 0} bytes`);
|
||||
}
|
||||
|
||||
if (!callSid || !tenantDomain) {
|
||||
this.logger.warn('Received media before start event');
|
||||
break;
|
||||
}
|
||||
|
||||
// msg.media.payload is base64-encoded μ-law audio from Twilio
|
||||
const twilioAudio = msg.media.payload;
|
||||
|
||||
// Convert Twilio audio (μ-law 8kHz) to OpenAI format (PCM16 24kHz)
|
||||
const openaiAudio = this.audioConverter.twilioToOpenAI(twilioAudio);
|
||||
|
||||
// Send audio to OpenAI Realtime API
|
||||
await this.voiceService.sendAudioToOpenAI(callSid, openaiAudio);
|
||||
break;
|
||||
|
||||
case 'stop':
|
||||
this.logger.log(`=== MEDIA STREAM EVENT: STOP ===`);
|
||||
this.logger.log(`StreamSid: ${streamSid}`);
|
||||
this.logger.log(`Total media packets received: ${mediaPacketCount}`);
|
||||
|
||||
if (streamSid) {
|
||||
this.mediaStreams.delete(streamSid);
|
||||
this.logger.log(`Removed WebSocket for streamSid: ${streamSid}. Remaining active streams: ${this.mediaStreams.size}`);
|
||||
}
|
||||
|
||||
// Clean up OpenAI connection
|
||||
if (callSid) {
|
||||
this.logger.log(`Cleaning up OpenAI connection for call ${callSid}...`);
|
||||
await this.voiceService.cleanupOpenAIConnection(callSid);
|
||||
this.logger.log(`✓ OpenAI connection cleaned up for call ${callSid}`);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
this.logger.debug(`Unknown media stream event: ${msg.event}`);
|
||||
}
|
||||
} catch (error: any) {
|
||||
this.logger.error(`Error processing media stream message: ${error.message}`);
|
||||
this.logger.error(`Stack: ${error.stack}`);
|
||||
}
|
||||
});
|
||||
|
||||
ws.on('close', () => {
|
||||
this.logger.log(`=== MEDIA STREAM WEBSOCKET CLOSED ===`);
|
||||
this.logger.log(`StreamSid: ${streamSid}`);
|
||||
this.logger.log(`Total media packets in this stream: ${mediaPacketCount}`);
|
||||
if (streamSid) {
|
||||
this.mediaStreams.delete(streamSid);
|
||||
this.logger.log(`Cleaned up streamSid on close. Remaining active streams: ${this.mediaStreams.size}`);
|
||||
}
|
||||
});
|
||||
|
||||
ws.on('error', (error: Error) => {
|
||||
this.logger.error(`=== MEDIA STREAM WEBSOCKET ERROR ===`);
|
||||
this.logger.error(`StreamSid: ${streamSid}`);
|
||||
this.logger.error(`Error message: ${error.message}`);
|
||||
this.logger.error(`Error stack: ${error.stack}`);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Send audio from OpenAI back to Twilio Media Stream
|
||||
*/
|
||||
async sendAudioToTwilio(streamSid: string, openaiAudioBase64: string) {
|
||||
const ws = this.mediaStreams.get(streamSid);
|
||||
|
||||
if (!ws) {
|
||||
this.logger.warn(`No Media Stream found for streamSid: ${streamSid}`);
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
// Convert OpenAI audio (PCM16 24kHz) to Twilio format (μ-law 8kHz)
|
||||
const twilioAudio = this.audioConverter.openAIToTwilio(openaiAudioBase64);
|
||||
|
||||
// Send to Twilio Media Stream
|
||||
const message = {
|
||||
event: 'media',
|
||||
streamSid,
|
||||
media: {
|
||||
payload: twilioAudio,
|
||||
},
|
||||
};
|
||||
|
||||
ws.send(JSON.stringify(message));
|
||||
} catch (error: any) {
|
||||
this.logger.error(`Error sending audio to Twilio: ${error.message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -40,7 +40,10 @@ export class VoiceGateway
|
||||
private readonly jwtService: JwtService,
|
||||
private readonly voiceService: VoiceService,
|
||||
private readonly tenantDbService: TenantDatabaseService,
|
||||
) {}
|
||||
) {
|
||||
// Set gateway reference in service to avoid circular dependency
|
||||
this.voiceService.setGateway(this);
|
||||
}
|
||||
|
||||
async handleConnection(client: AuthenticatedSocket) {
|
||||
try {
|
||||
@@ -49,7 +52,7 @@ export class VoiceGateway
|
||||
client.handshake.auth.token || client.handshake.headers.authorization?.split(' ')[1];
|
||||
|
||||
if (!token) {
|
||||
this.logger.warn('Client connection rejected: No token provided');
|
||||
this.logger.warn('❌ Client connection rejected: No token provided');
|
||||
client.disconnect();
|
||||
return;
|
||||
}
|
||||
@@ -82,8 +85,9 @@ export class VoiceGateway
|
||||
|
||||
this.connectedUsers.set(client.userId, client);
|
||||
this.logger.log(
|
||||
`Client connected: ${client.id} (User: ${client.userId}, Domain: ${domain})`,
|
||||
`✓ Client connected: ${client.id} (User: ${client.userId}, Domain: ${domain})`,
|
||||
);
|
||||
this.logger.log(`Total connected users in ${domain}: ${this.getConnectedUsers(domain).length}`);
|
||||
|
||||
// Send current call state if any active call
|
||||
const activeCallSid = this.activeCallsByUser.get(client.userId);
|
||||
@@ -95,7 +99,7 @@ export class VoiceGateway
|
||||
client.emit('call:state', callState);
|
||||
}
|
||||
} catch (error) {
|
||||
this.logger.error('Authentication failed', error);
|
||||
this.logger.error('❌ Authentication failed', error);
|
||||
client.disconnect();
|
||||
}
|
||||
}
|
||||
@@ -103,7 +107,8 @@ export class VoiceGateway
|
||||
handleDisconnect(client: AuthenticatedSocket) {
|
||||
if (client.userId) {
|
||||
this.connectedUsers.delete(client.userId);
|
||||
this.logger.log(`Client disconnected: ${client.id} (User: ${client.userId})`);
|
||||
this.logger.log(`✓ Client disconnected: ${client.id} (User: ${client.userId})`);
|
||||
this.logger.log(`Remaining connected users: ${this.connectedUsers.size}`);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ import { JwtModule } from '@nestjs/jwt';
|
||||
import { VoiceGateway } from './voice.gateway';
|
||||
import { VoiceService } from './voice.service';
|
||||
import { VoiceController } from './voice.controller';
|
||||
import { AudioConverterService } from './audio-converter.service';
|
||||
import { TenantModule } from '../tenant/tenant.module';
|
||||
import { AuthModule } from '../auth/auth.module';
|
||||
|
||||
@@ -15,7 +16,7 @@ import { AuthModule } from '../auth/auth.module';
|
||||
signOptions: { expiresIn: process.env.JWT_EXPIRES_IN || '24h' },
|
||||
}),
|
||||
],
|
||||
providers: [VoiceGateway, VoiceService],
|
||||
providers: [VoiceGateway, VoiceService, AudioConverterService],
|
||||
controllers: [VoiceController],
|
||||
exports: [VoiceService],
|
||||
})
|
||||
|
||||
@@ -15,11 +15,19 @@ export class VoiceService {
|
||||
private twilioClients: Map<string, Twilio.Twilio> = new Map();
|
||||
private openaiConnections: Map<string, WebSocket> = new Map(); // callSid -> WebSocket
|
||||
private callStates: Map<string, any> = new Map(); // callSid -> call state
|
||||
private voiceGateway: any; // Reference to gateway (to avoid circular dependency)
|
||||
|
||||
constructor(
|
||||
private readonly tenantDbService: TenantDatabaseService,
|
||||
) {}
|
||||
|
||||
/**
|
||||
* Set gateway reference (called by gateway on init)
|
||||
*/
|
||||
setGateway(gateway: any) {
|
||||
this.voiceGateway = gateway;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get Twilio client for a tenant
|
||||
*/
|
||||
@@ -487,6 +495,64 @@ export class VoiceService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Send audio data to OpenAI Realtime API
|
||||
*/
|
||||
async sendAudioToOpenAI(callSid: string, audioBase64: string) {
|
||||
const ws = this.openaiConnections.get(callSid);
|
||||
|
||||
if (!ws) {
|
||||
this.logger.warn(`No OpenAI connection for call ${callSid}`);
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
// Send audio chunk to OpenAI
|
||||
ws.send(JSON.stringify({
|
||||
type: 'input_audio_buffer.append',
|
||||
audio: audioBase64,
|
||||
}));
|
||||
} catch (error) {
|
||||
this.logger.error(`Failed to send audio to OpenAI for call ${callSid}`, error);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Commit audio buffer to OpenAI (trigger processing)
|
||||
*/
|
||||
async commitAudioBuffer(callSid: string) {
|
||||
const ws = this.openaiConnections.get(callSid);
|
||||
|
||||
if (!ws) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
ws.send(JSON.stringify({
|
||||
type: 'input_audio_buffer.commit',
|
||||
}));
|
||||
} catch (error) {
|
||||
this.logger.error(`Failed to commit audio buffer for call ${callSid}`, error);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Clean up OpenAI connection for a call
|
||||
*/
|
||||
async cleanupOpenAIConnection(callSid: string) {
|
||||
const ws = this.openaiConnections.get(callSid);
|
||||
|
||||
if (ws) {
|
||||
try {
|
||||
ws.close();
|
||||
this.openaiConnections.delete(callSid);
|
||||
this.logger.log(`Cleaned up OpenAI connection for call ${callSid}`);
|
||||
} catch (error) {
|
||||
this.logger.error(`Error cleaning up OpenAI connection for call ${callSid}`, error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle OpenAI Realtime messages
|
||||
*/
|
||||
@@ -505,9 +571,40 @@ export class VoiceService {
|
||||
}
|
||||
break;
|
||||
|
||||
case 'response.audio.delta':
|
||||
// OpenAI is sending audio response
|
||||
// This needs to be sent to Twilio Media Stream
|
||||
// Note: We'll need to get the streamSid from the call state
|
||||
const state = this.callStates.get(callSid);
|
||||
if (state?.streamSid && message.delta) {
|
||||
// The controller will handle sending to Twilio
|
||||
// Store audio delta for controller to pick up
|
||||
if (!state.pendingAudio) {
|
||||
state.pendingAudio = [];
|
||||
}
|
||||
state.pendingAudio.push(message.delta);
|
||||
}
|
||||
break;
|
||||
|
||||
case 'response.audio.done':
|
||||
// Audio response complete
|
||||
this.logger.log(`OpenAI audio response complete for call ${callSid}`);
|
||||
break;
|
||||
|
||||
case 'response.audio_transcript.delta':
|
||||
// Real-time transcript
|
||||
// TODO: Emit to gateway
|
||||
// Real-time transcript chunk
|
||||
const deltaState = this.callStates.get(callSid);
|
||||
if (deltaState?.userId && message.delta) {
|
||||
// Emit to frontend via gateway
|
||||
if (this.voiceGateway) {
|
||||
await this.voiceGateway.notifyAiTranscript(deltaState.userId, {
|
||||
callSid,
|
||||
transcript: message.delta,
|
||||
isFinal: false,
|
||||
});
|
||||
}
|
||||
this.logger.debug(`Transcript delta for call ${callSid}: ${message.delta}`);
|
||||
}
|
||||
break;
|
||||
|
||||
case 'response.audio_transcript.done':
|
||||
|
||||
Reference in New Issue
Block a user