From 19975917c5f51e55d45613dda2e0a2160a6193ce Mon Sep 17 00:00:00 2001 From: Talon Date: Wed, 13 May 2026 02:36:46 +0200 Subject: [PATCH] Enhance OpenAI TTS: add gpt-4o-mini-tts support with instructions parameter --- src/cli/args.ts | 5 +++++ src/config/config.ts | 4 +++- src/index.ts | 4 ++++ src/interfaces/index.ts | 1 + src/providers/tts/openAITTSProvider.ts | 5 +++-- src/utils/processor.ts | 6 ++++-- 6 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/cli/args.ts b/src/cli/args.ts index 0c4ba58..00d8a10 100644 --- a/src/cli/args.ts +++ b/src/cli/args.ts @@ -13,6 +13,7 @@ export interface CLIArgs { ttsModel?: string; ttsVoice?: string; ttsSpeedFactor?: number; + ttsInstructions?: string; outputDir?: string; tempDir?: string; batchTimeMode?: boolean; @@ -73,6 +74,10 @@ export function parseCommandLineArgs(): CLIArgs { describe: 'Speed factor for the audio playback', type: 'number' }) + .option('ttsInstructions', { + describe: 'Instructions for TTS voice style (gpt-4o-mini-tts)', + type: 'string' + }) .option('outputDir', { alias: 'o', describe: 'Directory for output files', diff --git a/src/config/config.ts b/src/config/config.ts index 895f4de..050af99 100644 --- a/src/config/config.ts +++ b/src/config/config.ts @@ -18,6 +18,7 @@ export interface Config { ttsProvider: string; ttsVoice: string; ttsSpeedFactor: number; + ttsInstructions?: string; ttsProviders: { [key: string]: TTSProviderConfig; }; @@ -68,10 +69,11 @@ export function getDefaultConfig(): Config { ttsProvider: "openai", ttsVoice: "alloy", ttsSpeedFactor: 1.5, + ttsInstructions: "Speak in a calm, narrating tone suitable for audio descriptions. Keep a steady pace and clear enunciation.", ttsProviders: { openai: { apiKey: process.env.OPENAI_API_KEY, - model: "tts-1-hd", + model: "gpt-4o-mini-tts", voice: "alloy" } }, diff --git a/src/index.ts b/src/index.ts index f64b67d..9ff2ec3 100644 --- a/src/index.ts +++ b/src/index.ts @@ -77,6 +77,10 @@ async function main(): Promise { config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice; } + if (argv.ttsInstructions) { + config.ttsInstructions = argv.ttsInstructions; + } + if (argv.saveConfig) { saveConfigToFile(argv.saveConfig, config); } diff --git a/src/interfaces/index.ts b/src/interfaces/index.ts index 69a3cb5..aa65179 100644 --- a/src/interfaces/index.ts +++ b/src/interfaces/index.ts @@ -39,6 +39,7 @@ export interface TTSOptions { voice?: string; model?: string; speedFactor?: number; + instructions?: string; } export interface TTSProviderConfig { diff --git a/src/providers/tts/openAITTSProvider.ts b/src/providers/tts/openAITTSProvider.ts index f8fdff8..42ac3fb 100644 --- a/src/providers/tts/openAITTSProvider.ts +++ b/src/providers/tts/openAITTSProvider.ts @@ -41,8 +41,9 @@ export class OpenAITTSProvider implements TTSProvider { const mp3 = await this.openai.audio.speech.create({ model: model, - voice: voice as any, // Type casting to any to avoid type issues - input: text + voice: voice as any, + input: text, + ...(options.instructions ? { instructions: options.instructions } : {}) }); // Cost calculation is based on character count diff --git a/src/utils/processor.ts b/src/utils/processor.ts index 35dcacf..46b3b7a 100644 --- a/src/utils/processor.ts +++ b/src/utils/processor.ts @@ -173,7 +173,8 @@ export async function generateAudioDescription( const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, { voice: settings.ttsVoice, model: settings.ttsProviders[settings.ttsProvider].model, - speedFactor: settings.ttsSpeedFactor + speedFactor: settings.ttsSpeedFactor, + instructions: settings.ttsInstructions }); const audioDuration = ttsResult.duration; @@ -304,7 +305,8 @@ async function generateAudioDescriptionBatch( const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, { voice: settings.ttsVoice, model: settings.ttsProviders[settings.ttsProvider].model, - speedFactor: settings.ttsSpeedFactor + speedFactor: settings.ttsSpeedFactor, + instructions: settings.ttsInstructions }); const audioDuration = ttsResult.duration;