215 lines
6.3 KiB
TypeScript
215 lines
6.3 KiB
TypeScript
import { Injectable, Logger } from '@nestjs/common';
|
|
|
|
/**
 * Audio format converter for Twilio <-> OpenAI audio streaming.
 *
 * Twilio Media Streams format:
 * - Codec: μ-law (G.711)
 * - Sample rate: 8kHz
 * - Encoding: base64
 * - Chunk size: 20ms (160 bytes)
 *
 * OpenAI Realtime API format:
 * - Codec: PCM16
 * - Sample rate: 24kHz
 * - Encoding: base64
 * - Mono channel
 *
 * Every conversion is stateless: each call processes only the buffer it is
 * given, so no interpolation or filter state is carried from one chunk to
 * the next.
 */
@Injectable()
export class AudioConverterService {
  private readonly logger = new Logger(AudioConverterService.name);

  // μ-law decode lookup table: μ-law byte (0..255) -> PCM16 sample
  private readonly MULAW_DECODE_TABLE = this.buildMuLawDecodeTable();

  // μ-law encode lookup table: (PCM16 sample + 32768) -> μ-law byte
  private readonly MULAW_ENCODE_TABLE = this.buildMuLawEncodeTable();

  /**
   * Build the μ-law to linear PCM16 decode table.
   *
   * @returns 256-entry table indexed by the μ-law byte
   */
  private buildMuLawDecodeTable(): Int16Array {
    const table = new Int16Array(256);
    for (let i = 0; i < 256; i++) {
      // μ-law bytes are stored bit-inverted; undo that first.
      const mulaw = ~i & 0xff;
      const exponent = (mulaw >> 4) & 0x07;
      const mantissa = mulaw & 0x0f;
      const magnitude = (((mantissa << 3) + 0x84) << exponent) - 0x84;
      // After inversion a SET sign bit marks a negative sample
      // (0x00 is the most negative code, 0x80 the most positive, 0xFF is zero).
      table[i] = (mulaw & 0x80) !== 0 ? -magnitude : magnitude;
    }
    return table;
  }

  /**
   * Build the linear PCM16 to μ-law encode table.
   *
   * @returns 65536-entry table indexed by `sample + 32768`
   */
  private buildMuLawEncodeTable(): Uint8Array {
    const BIAS = 0x84;
    // Largest magnitude that still fits in 15 bits once the bias is added.
    const CLIP = 32635;

    const table = new Uint8Array(65536);
    for (let i = 0; i < 65536; i++) {
      const sample = i - 32768;
      const sign = sample < 0 ? 0x80 : 0x00;

      // Clip before biasing; otherwise the loudest samples overflow the
      // 4-bit mantissa and wrap around to a much quieter code.
      const magnitude = Math.min(Math.abs(sample), CLIP);
      const biased = magnitude + BIAS; // always within 0x0084..0x7FFF

      // The segment number is the position of the highest set bit, counted
      // from bit 7: biased 0x84..0xFF -> 0, 0x100..0x1FF -> 1, ... -> 7.
      const exponent = 24 - Math.clz32(biased);

      // The four bits directly below the leading bit.
      const mantissa = (biased >> (exponent + 3)) & 0x0f;

      // Combine sign, exponent and mantissa, then invert as G.711 requires.
      table[i] = ~(sign | (exponent << 4) | mantissa) & 0xff;
    }
    return table;
  }

  /**
   * Decode μ-law audio to linear PCM16.
   *
   * @param mulawData - Buffer containing μ-law encoded audio (one byte per sample)
   * @returns Buffer containing PCM16 audio (16-bit little-endian), twice as long as the input
   */
  decodeMuLaw(mulawData: Buffer): Buffer {
    const pcm16 = Buffer.allocUnsafe(mulawData.length * 2);

    for (let i = 0; i < mulawData.length; i++) {
      pcm16.writeInt16LE(this.MULAW_DECODE_TABLE[mulawData[i]], i * 2);
    }

    return pcm16;
  }

  /**
   * Encode linear PCM16 to μ-law.
   *
   * @param pcm16Data - Buffer containing PCM16 audio (16-bit little-endian);
   *   a trailing odd byte, if present, is ignored
   * @returns Buffer containing μ-law encoded audio (one byte per sample)
   */
  encodeMuLaw(pcm16Data: Buffer): Buffer {
    // Floor so that a dangling half-sample cannot cause an out-of-range read.
    const sampleCount = Math.floor(pcm16Data.length / 2);
    const mulaw = Buffer.allocUnsafe(sampleCount);

    for (let i = 0; i < sampleCount; i++) {
      const sample = pcm16Data.readInt16LE(i * 2);
      // Shift the signed range -32768..32767 onto table indices 0..65535.
      mulaw[i] = this.MULAW_ENCODE_TABLE[sample + 32768];
    }

    return mulaw;
  }

  /**
   * Resample audio from 8kHz to 24kHz (linear interpolation).
   *
   * Each input sample produces three output samples: the sample itself and
   * two points interpolated towards the next sample. The last input sample
   * has no successor inside the buffer, so it is repeated.
   *
   * @param pcm16Data - Buffer containing 8kHz PCM16 audio; a trailing odd
   *   byte, if present, is ignored
   * @returns Buffer containing 24kHz PCM16 audio (three times as many samples)
   */
  resample8kTo24k(pcm16Data: Buffer): Buffer {
    const inputSamples = Math.floor(pcm16Data.length / 2);
    const output = Buffer.allocUnsafe(inputSamples * 3 * 2); // 8k * 3 = 24k

    for (let i = 0; i < inputSamples; i++) {
      const current = pcm16Data.readInt16LE(i * 2);
      const next =
        i + 1 < inputSamples ? pcm16Data.readInt16LE((i + 1) * 2) : current;
      const step = next - current;
      const offset = i * 6; // 3 output samples * 2 bytes each

      output.writeInt16LE(current, offset);
      output.writeInt16LE(Math.round(current + step / 3), offset + 2);
      output.writeInt16LE(Math.round(current + (step * 2) / 3), offset + 4);
    }

    return output;
  }

  /**
   * Resample audio from 24kHz to 8kHz (decimation with averaging).
   *
   * Every group of three consecutive input samples is averaged into one
   * output sample. Input samples left over after the last complete group
   * of three are dropped.
   *
   * @param pcm16Data - Buffer containing 24kHz PCM16 audio; a trailing odd
   *   byte, if present, is ignored
   * @returns Buffer containing 8kHz PCM16 audio
   */
  resample24kTo8k(pcm16Data: Buffer): Buffer {
    const inputSamples = Math.floor(pcm16Data.length / 2);
    const outputSamples = Math.floor(inputSamples / 3); // 24k / 3 = 8k
    const output = Buffer.allocUnsafe(outputSamples * 2);

    for (let i = 0; i < outputSamples; i++) {
      // Average 3 samples as a crude anti-aliasing filter.
      const offset = i * 6; // 3 input samples * 2 bytes each
      const sum =
        pcm16Data.readInt16LE(offset) +
        pcm16Data.readInt16LE(offset + 2) +
        pcm16Data.readInt16LE(offset + 4);

      output.writeInt16LE(Math.round(sum / 3), i * 2);
    }

    return output;
  }

  /**
   * Convert Twilio μ-law 8kHz to OpenAI PCM16 24kHz.
   *
   * @param twilioBase64 - Base64-encoded μ-law audio from Twilio
   * @returns Base64-encoded PCM16 24kHz audio for OpenAI
   * @throws Rethrows any error raised during conversion after logging it
   */
  twilioToOpenAI(twilioBase64: string): string {
    try {
      // Decode base64
      const mulawBuffer = Buffer.from(twilioBase64, 'base64');

      // μ-law -> PCM16
      const pcm16_8k = this.decodeMuLaw(mulawBuffer);

      // 8kHz -> 24kHz
      const pcm16_24k = this.resample8kTo24k(pcm16_8k);

      // Encode to base64
      return pcm16_24k.toString('base64');
    } catch (error) {
      // Pass the stack as a string so the logger prints it as a stack trace.
      this.logger.error(
        'Error converting Twilio to OpenAI audio',
        error instanceof Error ? error.stack : String(error),
      );
      throw error;
    }
  }

  /**
   * Convert OpenAI PCM16 24kHz to Twilio μ-law 8kHz.
   *
   * @param openaiBase64 - Base64-encoded PCM16 24kHz audio from OpenAI
   * @returns Base64-encoded μ-law 8kHz audio for Twilio
   * @throws Rethrows any error raised during conversion after logging it
   */
  openAIToTwilio(openaiBase64: string): string {
    try {
      // Decode base64
      const pcm16_24k = Buffer.from(openaiBase64, 'base64');

      // 24kHz -> 8kHz
      const pcm16_8k = this.resample24kTo8k(pcm16_24k);

      // PCM16 -> μ-law
      const mulawBuffer = this.encodeMuLaw(pcm16_8k);

      // Encode to base64
      return mulawBuffer.toString('base64');
    } catch (error) {
      // Pass the stack as a string so the logger prints it as a stack trace.
      this.logger.error(
        'Error converting OpenAI to Twilio audio',
        error instanceof Error ? error.stack : String(error),
      );
      throw error;
    }
  }
}
|