From 19975917c5f51e55d45613dda2e0a2160a6193ce Mon Sep 17 00:00:00 2001
From: Talon <talon@iamtalon.me>
Date: Wed, 13 May 2026 02:36:46 +0200
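Subject: [PATCH] Enhance OpenAI TTS: add gpt-4o-mini-tts support with
 instructions parameter

Add an optional TTS instructions setting: a `ttsInstructions` CLI option,
`Config.ttsInstructions`, and `TTSOptions.instructions`. The value is
passed to the OpenAI speech request from both generateAudioDescription
and generateAudioDescriptionBatch, and is only included in the request
when it is non-empty.

This also changes the defaults: the OpenAI TTS model switches from
tts-1-hd to gpt-4o-mini-tts, and a default narration instruction string
is added to the default config.
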
---
 src/cli/args.ts                        | 5 +++++
 src/config/config.ts                   | 4 +++-
 src/index.ts                           | 4 ++++
 src/interfaces/index.ts                | 1 +
 src/providers/tts/openAITTSProvider.ts | 5 +++--
 src/utils/processor.ts                 | 6 ++++--
 6 files changed, 20 insertions(+), 5 deletions(-)
diff --git a/src/cli/args.ts b/src/cli/args.ts
index 0c4ba58..00d8a10 100644
--- a/src/cli/args.ts
+++ b/src/cli/args.ts
@@ -13,6 +13,7 @@ export interface CLIArgs {
   ttsModel?: string;
   ttsVoice?: string;
   ttsSpeedFactor?: number;
+  ttsInstructions?: string;
   outputDir?: string;
   tempDir?: string;
   batchTimeMode?: boolean;
@@ -73,6 +74,10 @@ export function parseCommandLineArgs(): CLIArgs {
       describe: 'Speed factor for the audio playback',
       type: 'number'
     })
+    .option('ttsInstructions', {
+      describe: 'Instructions for TTS voice style (gpt-4o-mini-tts)',
+      type: 'string'
+    })
     .option('outputDir', {
       alias: 'o',
       describe: 'Directory for output files',
diff --git a/src/config/config.ts b/src/config/config.ts
index 895f4de..050af99 100644
--- a/src/config/config.ts
+++ b/src/config/config.ts
@@ -18,6 +18,7 @@ export interface Config {
   ttsProvider: string;
   ttsVoice: string;
   ttsSpeedFactor: number;
+  ttsInstructions?: string;
   ttsProviders: {
     [key: string]: TTSProviderConfig;
   };
@@ -68,10 +69,11 @@ export function getDefaultConfig(): Config {
     ttsProvider: "openai",
     ttsVoice: "alloy",
     ttsSpeedFactor: 1.5,
+    ttsInstructions: "Speak in a calm, narrating tone suitable for audio descriptions. Keep a steady pace and clear enunciation.",
     ttsProviders: {
       openai: {
         apiKey: process.env.OPENAI_API_KEY,
-        model: "tts-1-hd",
+        model: "gpt-4o-mini-tts",
         voice: "alloy"
       }
     },
diff --git a/src/index.ts b/src/index.ts
index f64b67d..9ff2ec3 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -77,6 +77,10 @@ async function main(): Promise<void> {
     config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
   }
 
+  if (argv.ttsInstructions) {
+    config.ttsInstructions = argv.ttsInstructions;
+  }
+
   if (argv.saveConfig) {
     saveConfigToFile(argv.saveConfig, config);
   }
diff --git a/src/interfaces/index.ts b/src/interfaces/index.ts
index 69a3cb5..aa65179 100644
--- a/src/interfaces/index.ts
+++ b/src/interfaces/index.ts
@@ -39,6 +39,7 @@ export interface TTSOptions {
   voice?: string;
   model?: string;
   speedFactor?: number;
+  instructions?: string;
 }
 
 export interface TTSProviderConfig {
diff --git a/src/providers/tts/openAITTSProvider.ts b/src/providers/tts/openAITTSProvider.ts
index f8fdff8..42ac3fb 100644
--- a/src/providers/tts/openAITTSProvider.ts
+++ b/src/providers/tts/openAITTSProvider.ts
@@ -41,8 +41,9 @@ export class OpenAITTSProvider implements TTSProvider {
 
       const mp3 = await this.openai.audio.speech.create({
         model: model,
-        voice: voice as any, // Type casting to any to avoid type issues
-        input: text
+        voice: voice as any,
+        input: text,
+        ...(options.instructions ? { instructions: options.instructions } : {})
       });
 
       // Cost calculation is based on character count
diff --git a/src/utils/processor.ts b/src/utils/processor.ts
index 35dcacf..46b3b7a 100644
--- a/src/utils/processor.ts
+++ b/src/utils/processor.ts
@@ -173,7 +173,8 @@ export async function generateAudioDescription(
     const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
       voice: settings.ttsVoice,
       model: settings.ttsProviders[settings.ttsProvider].model,
-      speedFactor: settings.ttsSpeedFactor
+      speedFactor: settings.ttsSpeedFactor,
+      instructions: settings.ttsInstructions
     });
 
     const audioDuration = ttsResult.duration;
@@ -304,7 +305,8 @@ async function generateAudioDescriptionBatch(
     const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
       voice: settings.ttsVoice,
       model: settings.ttsProviders[settings.ttsProvider].model,
-      speedFactor: settings.ttsSpeedFactor
+      speedFactor: settings.ttsSpeedFactor,
+      instructions: settings.ttsInstructions
     });
 
     const audioDuration = ttsResult.duration;