import { Injectable, Logger } from '@nestjs/common';

/**
 * Audio format converter for Twilio <-> OpenAI audio streaming.
 *
 * Twilio Media Streams format:
 * - Codec: μ-law (G.711)
 * - Sample rate: 8kHz
 * - Encoding: base64
 * - Chunk size: 20ms (160 bytes)
 *
 * OpenAI Realtime API format:
 * - Codec: PCM16 (16-bit little-endian)
 * - Sample rate: 24kHz
 * - Encoding: base64
 * - Mono channel
 *
 * All conversions are stateless: each call processes the chunk it is given
 * and keeps nothing for the next call. A trailing odd byte (half a PCM16
 * sample) is ignored, and when downsampling, trailing samples that do not
 * fill a complete group of three are dropped.
 */
@Injectable()
export class AudioConverterService {
  private readonly logger = new Logger(AudioConverterService.name);

  // μ-law byte (0..255) -> linear PCM16 sample
  private readonly MULAW_DECODE_TABLE = this.buildMuLawDecodeTable();

  // (PCM16 sample + 32768) -> μ-law byte
  private readonly MULAW_ENCODE_TABLE = this.buildMuLawEncodeTable();

  /**
   * Build the μ-law -> linear PCM16 decode table.
   *
   * μ-law bytes are stored bit-inverted. After undoing the inversion the
   * layout is `s eee mmmm`: sign (1 = negative), 3-bit exponent, 4-bit
   * mantissa.
   *
   * @returns 256-entry table indexed by the raw μ-law byte
   */
  private buildMuLawDecodeTable(): Int16Array {
    const BIAS = 0x84;
    const table = new Int16Array(256);

    for (let code = 0; code < 256; code++) {
      const inverted = ~code & 0xff;
      const exponent = (inverted >> 4) & 0x07;
      const mantissa = inverted & 0x0f;

      // Re-attach the bias that was added before quantization, scale by the
      // segment, then remove the bias again.
      const magnitude = (((mantissa << 3) + BIAS) << exponent) - BIAS;

      // Sign bit set (after un-inverting) means a negative sample, so
      // 0xFF decodes to 0 and 0x00 decodes to the most negative value.
      table[code] = (inverted & 0x80) !== 0 ? -magnitude : magnitude;
    }

    return table;
  }

  /**
   * Build the linear PCM16 -> μ-law encode table.
   *
   * @returns 65536-entry table indexed by `sample + 32768`
   */
  private buildMuLawEncodeTable(): Uint8Array {
    const BIAS = 0x84;
    // Largest magnitude that still fits in 15 bits once the bias is added;
    // without this clip the mantissa of very loud samples wraps around.
    const CLIP = 32635;
    const table = new Uint8Array(65536);

    for (let i = 0; i < 65536; i++) {
      const sample = i - 32768;
      const sign = sample < 0 ? 0x80 : 0x00;
      const biased = Math.min(Math.abs(sample), CLIP) + BIAS;

      // Segment number: 0 for biased < 0x100, 1 for < 0x200, ... 7 otherwise.
      let exponent = 7;
      for (let exp = 0; exp < 7; exp++) {
        if (biased < 0x100 << exp) {
          exponent = exp;
          break;
        }
      }

      // The four bits just below the segment's leading 1 bit.
      const mantissa = (biased >> (exponent + 3)) & 0x0f;

      // μ-law bytes are transmitted bit-inverted.
      table[i] = ~(sign | (exponent << 4) | mantissa) & 0xff;
    }

    return table;
  }

  /**
   * Decode μ-law audio to linear PCM16.
   *
   * @param mulawData - Buffer containing μ-law encoded audio (one byte per sample)
   * @returns Buffer containing PCM16 audio (16-bit little-endian), twice as long as the input
   */
  decodeMuLaw(mulawData: Buffer): Buffer {
    const pcm16 = Buffer.allocUnsafe(mulawData.length * 2);

    for (let i = 0; i < mulawData.length; i++) {
      pcm16.writeInt16LE(this.MULAW_DECODE_TABLE[mulawData[i]], i * 2);
    }

    return pcm16;
  }

  /**
   * Encode linear PCM16 to μ-law.
   *
   * @param pcm16Data - Buffer containing PCM16 audio (16-bit little-endian);
   *   a trailing odd byte is ignored
   * @returns Buffer containing μ-law encoded audio (one byte per input sample)
   */
  encodeMuLaw(pcm16Data: Buffer): Buffer {
    const sampleCount = Math.floor(pcm16Data.length / 2);
    const mulaw = Buffer.allocUnsafe(sampleCount);

    for (let i = 0; i < sampleCount; i++) {
      // Shift the signed sample into the table's 0..65535 index range.
      const index = pcm16Data.readInt16LE(i * 2) + 32768;
      mulaw[i] = this.MULAW_ENCODE_TABLE[index];
    }

    return mulaw;
  }

  /**
   * Resample audio from 8kHz to 24kHz using linear interpolation.
   *
   * Each input sample yields three output samples. The final input sample
   * has no successor inside the chunk, so its value is held for the last
   * outputs.
   *
   * @param pcm16Data - Buffer containing 8kHz PCM16 audio; a trailing odd byte is ignored
   * @returns Buffer containing 24kHz PCM16 audio
   */
  resample8kTo24k(pcm16Data: Buffer): Buffer {
    const inputSamples = Math.floor(pcm16Data.length / 2);
    const outputSamples = inputSamples * 3; // 8k * 3 = 24k
    const output = Buffer.allocUnsafe(outputSamples * 2);

    for (let i = 0; i < outputSamples; i++) {
      const srcIndex = i / 3;
      const srcIndexFloor = Math.floor(srcIndex);
      // Clamp so the last input sample interpolates against itself.
      const srcIndexCeil = Math.min(srcIndexFloor + 1, inputSamples - 1);
      const fraction = srcIndex - srcIndexFloor;

      const sample1 = pcm16Data.readInt16LE(srcIndexFloor * 2);
      const sample2 = pcm16Data.readInt16LE(srcIndexCeil * 2);

      // Linear interpolation; the result lies between the two neighbours,
      // so it always fits in an int16.
      const interpolated = Math.round(sample1 + (sample2 - sample1) * fraction);
      output.writeInt16LE(interpolated, i * 2);
    }

    return output;
  }

  /**
   * Resample audio from 24kHz to 8kHz by averaging each group of three samples.
   *
   * Input samples left over after the last complete group of three are dropped.
   *
   * @param pcm16Data - Buffer containing 24kHz PCM16 audio; a trailing odd byte is ignored
   * @returns Buffer containing 8kHz PCM16 audio
   */
  resample24kTo8k(pcm16Data: Buffer): Buffer {
    const inputSamples = Math.floor(pcm16Data.length / 2);
    const outputSamples = Math.floor(inputSamples / 3); // 24k / 3 = 8k
    const output = Buffer.allocUnsafe(outputSamples * 2);

    for (let i = 0; i < outputSamples; i++) {
      // Byte offset of the first sample in this group of three; the group is
      // always fully inside the buffer because outputSamples is floored.
      const offset = i * 3 * 2;
      const sum =
        pcm16Data.readInt16LE(offset) +
        pcm16Data.readInt16LE(offset + 2) +
        pcm16Data.readInt16LE(offset + 4);

      // Averaging acts as a crude low-pass filter before decimation.
      output.writeInt16LE(Math.round(sum / 3), i * 2);
    }

    return output;
  }

  /**
   * Convert Twilio μ-law 8kHz to OpenAI PCM16 24kHz.
   *
   * @param twilioBase64 - Base64-encoded μ-law audio from Twilio
   * @returns Base64-encoded PCM16 24kHz audio for OpenAI
   * @throws Rethrows any error raised during conversion after logging it
   */
  twilioToOpenAI(twilioBase64: string): string {
    try {
      // Decode base64
      const mulawBuffer = Buffer.from(twilioBase64, 'base64');

      // μ-law -> PCM16
      const pcm16_8k = this.decodeMuLaw(mulawBuffer);

      // 8kHz -> 24kHz
      const pcm16_24k = this.resample8kTo24k(pcm16_8k);

      // Encode to base64
      return pcm16_24k.toString('base64');
    } catch (error) {
      this.logger.error('Error converting Twilio to OpenAI audio', error);
      throw error;
    }
  }

  /**
   * Convert OpenAI PCM16 24kHz to Twilio μ-law 8kHz.
   *
   * @param openaiBase64 - Base64-encoded PCM16 24kHz audio from OpenAI
   * @returns Base64-encoded μ-law 8kHz audio for Twilio
   * @throws Rethrows any error raised during conversion after logging it
   */
  openAIToTwilio(openaiBase64: string): string {
    try {
      // Decode base64
      const pcm16_24k = Buffer.from(openaiBase64, 'base64');

      // 24kHz -> 8kHz
      const pcm16_8k = this.resample24kTo8k(pcm16_24k);

      // PCM16 -> μ-law
      const mulawBuffer = this.encodeMuLaw(pcm16_8k);

      // Encode to base64
      return mulawBuffer.toString('base64');
    } catch (error) {
      this.logger.error('Error converting OpenAI to Twilio audio', error);
      throw error;
    }
  }
}