// aidio-description/src/providers/tts/openAITTSProvider.ts

import fs from 'fs';
import { execFileSync, execSync } from 'child_process';
import { OpenAI } from 'openai';
import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
import { getAudioDuration } from '../../utils/mediaUtils';

/**
 * OpenAI TTS Provider Implementation
 *
 * Synthesizes speech through the OpenAI audio speech endpoint, writes the
 * result to disk and, when a speed factor other than 1 is requested, re-times
 * the audio with FFmpeg (`ffmpeg` is invoked by name, so it has to be
 * resolvable by the operating system).
 */
export class OpenAITTSProvider implements TTSProvider {
  private readonly config: TTSProviderConfig;
  private readonly openai: OpenAI;

  /**
   * @param config - Provider configuration; `apiKey` is handed to the OpenAI
   *   client, `voice` and `model` act as defaults for `textToSpeech`.
   */
  constructor(config: TTSProviderConfig) {
    this.config = config;
    this.openai = new OpenAI({
      apiKey: config.apiKey,
    });
  }

  /**
   * Builds the path of the intermediate file that receives the raw API output.
   *
   * `_temp` is inserted in front of a trailing `.ext`; when the path has no
   * such extension `_temp` is appended instead, so the result never equals
   * `outputPath` (FFmpeg cannot read and write the same file, and the
   * temporary file is deleted afterwards).
   */
  private static deriveTempPath(outputPath: string): string {
    const withMarker = outputPath.replace(/\.\w+$/, '_temp$&');
    return withMarker === outputPath ? `${outputPath}_temp` : withMarker;
  }

  /**
   * Convert text to speech
   *
   * If anything in the normal path throws (API call, file I/O, FFmpeg,
   * duration probing), the error is logged and a 1-second silent MP3 is
   * written to `outputPath` instead; the result then reports `duration: 1`
   * and `cost: 0`. A failure while generating that silent file is not caught
   * and rejects the returned promise.
   *
   * @param text - Text to convert to speech
   * @param outputPath - Output path for the audio file
   * @param options - Additional options; `voice` and `model` override the
   *   configured defaults, `speedFactor` (default 1.0) is applied through
   *   FFmpeg's `atempo` filter, `instructions` is forwarded to the API when set
   * @returns Duration of the generated audio in seconds and cost (the number
   *   of characters in `text`)
   */
  async textToSpeech(
    text: string,
    outputPath: string,
    options: TTSOptions = {}
  ): Promise<TTSResult> {
    try {
      // Get the options, with defaults from config
      const voice = options.voice || this.config.voice;
      const model = options.model || this.config.model;
      // `||` is deliberate: a missing or zero speed factor means "unchanged".
      const speedFactor = options.speedFactor || 1.0;

      // Generate the initial TTS output
      const tempOutputPath = OpenAITTSProvider.deriveTempPath(outputPath);

      const mp3 = await this.openai.audio.speech.create({
        model: model,
        // NOTE(review): cast kept from the original; the declared type of
        // `voice` is not visible here — presumably a plain string. TODO narrow.
        voice: voice as any,
        input: text,
        ...(options.instructions ? { instructions: options.instructions } : {})
      });

      // Cost calculation is based on character count
      const cost = text.length;

      const buffer = Buffer.from(await mp3.arrayBuffer());

      try {
        fs.writeFileSync(tempOutputPath, buffer);

        if (speedFactor !== 1.0) {
          // Speed up the audio using FFmpeg. Arguments are passed as an array
          // (no shell), so quotes or metacharacters in the paths cannot alter
          // the command.
          execFileSync('ffmpeg', [
            '-v', 'error',
            '-i', tempOutputPath,
            '-filter:a', `atempo=${speedFactor}`,
            '-c:a', 'libmp3lame',
            '-q:a', '2',
            outputPath,
            '-y'
          ]);
        } else {
          // Just use the file as is
          fs.renameSync(tempOutputPath, outputPath);
        }
      } finally {
        // Clean up the temporary file on success and on failure alike. After
        // a successful rename it no longer exists, hence the existence check.
        if (fs.existsSync(tempOutputPath)) {
          fs.unlinkSync(tempOutputPath);
        }
      }

      // Get actual audio duration for accurate timing
      const audioDuration = getAudioDuration(outputPath);

      return {
        duration: audioDuration,
        cost: cost
      };
    } catch (error) {
      console.error("Error generating speech:", error);
      // Create a silent audio file if TTS fails (1 s, mono, 24 kHz, MP3)
      execFileSync('ffmpeg', [
        '-v', 'error',
        '-f', 'lavfi',
        '-i', 'anullsrc=r=24000:cl=mono',
        '-t', '1',
        '-q:a', '9',
        '-acodec', 'libmp3lame',
        outputPath,
        '-y'
      ]);
      return {
        duration: 1,
        cost: 0
      };
    }
  }
}