Add twilio softphone with integrated AI assistant
This commit is contained in:
214
backend/src/voice/audio-converter.service.ts
Normal file
214
backend/src/voice/audio-converter.service.ts
Normal file
@@ -0,0 +1,214 @@
|
||||
import { Injectable, Logger } from '@nestjs/common';
|
||||
|
||||
/**
|
||||
* Audio format converter for Twilio <-> OpenAI audio streaming
|
||||
*
|
||||
* Twilio Media Streams format:
|
||||
* - Codec: μ-law (G.711)
|
||||
* - Sample rate: 8kHz
|
||||
* - Encoding: base64
|
||||
* - Chunk size: 20ms (160 bytes)
|
||||
*
|
||||
* OpenAI Realtime API format:
|
||||
* - Codec: PCM16
|
||||
* - Sample rate: 24kHz
|
||||
* - Encoding: base64
|
||||
* - Mono channel
|
||||
*/
|
||||
@Injectable()
|
||||
export class AudioConverterService {
|
||||
private readonly logger = new Logger(AudioConverterService.name);
|
||||
|
||||
// μ-law decode lookup table
|
||||
private readonly MULAW_DECODE_TABLE = this.buildMuLawDecodeTable();
|
||||
|
||||
// μ-law encode lookup table
|
||||
private readonly MULAW_ENCODE_TABLE = this.buildMuLawEncodeTable();
|
||||
|
||||
/**
|
||||
* Build μ-law to linear PCM16 decode table
|
||||
*/
|
||||
private buildMuLawDecodeTable(): Int16Array {
|
||||
const table = new Int16Array(256);
|
||||
for (let i = 0; i < 256; i++) {
|
||||
const mulaw = ~i;
|
||||
const exponent = (mulaw >> 4) & 0x07;
|
||||
const mantissa = mulaw & 0x0f;
|
||||
let sample = (mantissa << 3) + 0x84;
|
||||
sample <<= exponent;
|
||||
sample -= 0x84;
|
||||
if ((mulaw & 0x80) === 0) {
|
||||
sample = -sample;
|
||||
}
|
||||
table[i] = sample;
|
||||
}
|
||||
return table;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build linear PCM16 to μ-law encode table
|
||||
*/
|
||||
private buildMuLawEncodeTable(): Uint8Array {
|
||||
const table = new Uint8Array(65536);
|
||||
for (let i = 0; i < 65536; i++) {
|
||||
const sample = (i - 32768);
|
||||
const sign = sample < 0 ? 0x80 : 0x00;
|
||||
const magnitude = Math.abs(sample);
|
||||
|
||||
// Add bias
|
||||
let biased = magnitude + 0x84;
|
||||
|
||||
// Find exponent
|
||||
let exponent = 7;
|
||||
for (let exp = 0; exp < 8; exp++) {
|
||||
if (biased <= (0xff << exp)) {
|
||||
exponent = exp;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Extract mantissa
|
||||
const mantissa = (biased >> (exponent + 3)) & 0x0f;
|
||||
|
||||
// Combine sign, exponent, mantissa
|
||||
const mulaw = ~(sign | (exponent << 4) | mantissa);
|
||||
table[i] = mulaw & 0xff;
|
||||
}
|
||||
return table;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode μ-law audio to linear PCM16
|
||||
* @param mulawData - Buffer containing μ-law encoded audio
|
||||
* @returns Buffer containing PCM16 audio (16-bit little-endian)
|
||||
*/
|
||||
decodeMuLaw(mulawData: Buffer): Buffer {
|
||||
const pcm16 = Buffer.allocUnsafe(mulawData.length * 2);
|
||||
|
||||
for (let i = 0; i < mulawData.length; i++) {
|
||||
const sample = this.MULAW_DECODE_TABLE[mulawData[i]];
|
||||
pcm16.writeInt16LE(sample, i * 2);
|
||||
}
|
||||
|
||||
return pcm16;
|
||||
}
|
||||
|
||||
/**
|
||||
* Encode linear PCM16 to μ-law
|
||||
* @param pcm16Data - Buffer containing PCM16 audio (16-bit little-endian)
|
||||
* @returns Buffer containing μ-law encoded audio
|
||||
*/
|
||||
encodeMuLaw(pcm16Data: Buffer): Buffer {
|
||||
const mulaw = Buffer.allocUnsafe(pcm16Data.length / 2);
|
||||
|
||||
for (let i = 0; i < pcm16Data.length; i += 2) {
|
||||
const sample = pcm16Data.readInt16LE(i);
|
||||
const index = (sample + 32768) & 0xffff;
|
||||
mulaw[i / 2] = this.MULAW_ENCODE_TABLE[index];
|
||||
}
|
||||
|
||||
return mulaw;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resample audio from 8kHz to 24kHz (linear interpolation)
|
||||
* @param pcm16Data - Buffer containing 8kHz PCM16 audio
|
||||
* @returns Buffer containing 24kHz PCM16 audio
|
||||
*/
|
||||
resample8kTo24k(pcm16Data: Buffer): Buffer {
|
||||
const inputSamples = pcm16Data.length / 2;
|
||||
const outputSamples = Math.floor(inputSamples * 3); // 8k * 3 = 24k
|
||||
const output = Buffer.allocUnsafe(outputSamples * 2);
|
||||
|
||||
for (let i = 0; i < outputSamples; i++) {
|
||||
const srcIndex = i / 3;
|
||||
const srcIndexFloor = Math.floor(srcIndex);
|
||||
const srcIndexCeil = Math.min(srcIndexFloor + 1, inputSamples - 1);
|
||||
const fraction = srcIndex - srcIndexFloor;
|
||||
|
||||
const sample1 = pcm16Data.readInt16LE(srcIndexFloor * 2);
|
||||
const sample2 = pcm16Data.readInt16LE(srcIndexCeil * 2);
|
||||
|
||||
// Linear interpolation
|
||||
const interpolated = Math.round(sample1 + (sample2 - sample1) * fraction);
|
||||
output.writeInt16LE(interpolated, i * 2);
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resample audio from 24kHz to 8kHz (decimation with averaging)
|
||||
* @param pcm16Data - Buffer containing 24kHz PCM16 audio
|
||||
* @returns Buffer containing 8kHz PCM16 audio
|
||||
*/
|
||||
resample24kTo8k(pcm16Data: Buffer): Buffer {
|
||||
const inputSamples = pcm16Data.length / 2;
|
||||
const outputSamples = Math.floor(inputSamples / 3); // 24k / 3 = 8k
|
||||
const output = Buffer.allocUnsafe(outputSamples * 2);
|
||||
|
||||
for (let i = 0; i < outputSamples; i++) {
|
||||
// Average 3 samples for anti-aliasing
|
||||
const idx1 = Math.min(i * 3, inputSamples - 1);
|
||||
const idx2 = Math.min(i * 3 + 1, inputSamples - 1);
|
||||
const idx3 = Math.min(i * 3 + 2, inputSamples - 1);
|
||||
|
||||
const sample1 = pcm16Data.readInt16LE(idx1 * 2);
|
||||
const sample2 = pcm16Data.readInt16LE(idx2 * 2);
|
||||
const sample3 = pcm16Data.readInt16LE(idx3 * 2);
|
||||
|
||||
const averaged = Math.round((sample1 + sample2 + sample3) / 3);
|
||||
output.writeInt16LE(averaged, i * 2);
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert Twilio μ-law 8kHz to OpenAI PCM16 24kHz
|
||||
* @param twilioBase64 - Base64-encoded μ-law audio from Twilio
|
||||
* @returns Base64-encoded PCM16 24kHz audio for OpenAI
|
||||
*/
|
||||
twilioToOpenAI(twilioBase64: string): string {
|
||||
try {
|
||||
// Decode base64
|
||||
const mulawBuffer = Buffer.from(twilioBase64, 'base64');
|
||||
|
||||
// μ-law -> PCM16
|
||||
const pcm16_8k = this.decodeMuLaw(mulawBuffer);
|
||||
|
||||
// 8kHz -> 24kHz
|
||||
const pcm16_24k = this.resample8kTo24k(pcm16_8k);
|
||||
|
||||
// Encode to base64
|
||||
return pcm16_24k.toString('base64');
|
||||
} catch (error) {
|
||||
this.logger.error('Error converting Twilio to OpenAI audio', error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert OpenAI PCM16 24kHz to Twilio μ-law 8kHz
|
||||
* @param openaiBase64 - Base64-encoded PCM16 24kHz audio from OpenAI
|
||||
* @returns Base64-encoded μ-law 8kHz audio for Twilio
|
||||
*/
|
||||
openAIToTwilio(openaiBase64: string): string {
|
||||
try {
|
||||
// Decode base64
|
||||
const pcm16_24k = Buffer.from(openaiBase64, 'base64');
|
||||
|
||||
// 24kHz -> 8kHz
|
||||
const pcm16_8k = this.resample24kTo8k(pcm16_24k);
|
||||
|
||||
// PCM16 -> μ-law
|
||||
const mulawBuffer = this.encodeMuLaw(pcm16_8k);
|
||||
|
||||
// Encode to base64
|
||||
return mulawBuffer.toString('base64');
|
||||
} catch (error) {
|
||||
this.logger.error('Error converting OpenAI to Twilio audio', error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user