Enhance OpenAI TTS: add gpt-4o-mini-tts support with instructions parameter

2026-05-13 02:36:46 +02:00
parent fc02d2001c
commit 19975917c5
6 changed files with 20 additions and 5 deletions
--- a/src/cli/args.ts
+++ b/src/cli/args.ts
@@ -13,6 +13,7 @@ export interface CLIArgs {
  ttsModel?: string;
  ttsVoice?: string;
  ttsSpeedFactor?: number;
+  ttsInstructions?: string;
  outputDir?: string;
  tempDir?: string;
  batchTimeMode?: boolean;
@@ -73,6 +74,10 @@ export function parseCommandLineArgs(): CLIArgs {
      describe: 'Speed factor for the audio playback',
      type: 'number'
    })
+    .option('ttsInstructions', {
+      describe: 'Instructions for TTS voice style (gpt-4o-mini-tts)',
+      type: 'string'
+    })
    .option('outputDir', {
      alias: 'o',
      describe: 'Directory for output files',
--- a/src/config/config.ts
+++ b/src/config/config.ts
@@ -18,6 +18,7 @@ export interface Config {
  ttsProvider: string;
  ttsVoice: string;
  ttsSpeedFactor: number;
+  ttsInstructions?: string;
  ttsProviders: {
    [key: string]: TTSProviderConfig;
  };
@@ -68,10 +69,11 @@ export function getDefaultConfig(): Config {
    ttsProvider: "openai",
    ttsVoice: "alloy",
    ttsSpeedFactor: 1.5,
+    ttsInstructions: "Speak in a calm, narrating tone suitable for audio descriptions. Keep a steady pace and clear enunciation.",
    ttsProviders: {
      openai: {
        apiKey: process.env.OPENAI_API_KEY,
-        model: "tts-1-hd",
+        model: "gpt-4o-mini-tts",
        voice: "alloy"
      }
    },
--- a/src/index.ts
+++ b/src/index.ts
@@ -77,6 +77,10 @@ async function main(): Promise<void> {
    config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
  }

+  if (argv.ttsInstructions) {
+    config.ttsInstructions = argv.ttsInstructions;
+  }
+
  if (argv.saveConfig) {
    saveConfigToFile(argv.saveConfig, config);
  }
--- a/src/interfaces/index.ts
+++ b/src/interfaces/index.ts
@@ -39,6 +39,7 @@ export interface TTSOptions {
  voice?: string;
  model?: string;
  speedFactor?: number;
+  instructions?: string;
 }

 export interface TTSProviderConfig {
--- a/src/providers/tts/openAITTSProvider.ts
+++ b/src/providers/tts/openAITTSProvider.ts
@@ -41,8 +41,9 @@ export class OpenAITTSProvider implements TTSProvider {

      const mp3 = await this.openai.audio.speech.create({
        model: model,
-        voice: voice as any, // Type casting to any to avoid type issues
-        input: text
+        voice: voice as any,
+        input: text,
+        ...(options.instructions ? { instructions: options.instructions } : {})
      });

      // Cost calculation is based on character count
--- a/src/utils/processor.ts
+++ b/src/utils/processor.ts
@@ -173,7 +173,8 @@ export async function generateAudioDescription(
    const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
      voice: settings.ttsVoice,
      model: settings.ttsProviders[settings.ttsProvider].model,
-      speedFactor: settings.ttsSpeedFactor
+      speedFactor: settings.ttsSpeedFactor,
+      instructions: settings.ttsInstructions
    });

    const audioDuration = ttsResult.duration;
@@ -304,7 +305,8 @@ async function generateAudioDescriptionBatch(
    const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
      voice: settings.ttsVoice,
      model: settings.ttsProviders[settings.ttsProvider].model,
-      speedFactor: settings.ttsSpeedFactor
+      speedFactor: settings.ttsSpeedFactor,
+      instructions: settings.ttsInstructions
    });

    const audioDuration = ttsResult.duration;