Enhance OpenAI TTS: add gpt-4o-mini-tts support (now the default model) with an optional `instructions` parameter for voice style

This commit is contained in:
2026-05-13 02:36:46 +02:00
parent fc02d2001c
commit 19975917c5
6 changed files with 20 additions and 5 deletions

View File

@@ -13,6 +13,7 @@ export interface CLIArgs {
ttsModel?: string; ttsModel?: string;
ttsVoice?: string; ttsVoice?: string;
ttsSpeedFactor?: number; ttsSpeedFactor?: number;
ttsInstructions?: string;
outputDir?: string; outputDir?: string;
tempDir?: string; tempDir?: string;
batchTimeMode?: boolean; batchTimeMode?: boolean;
@@ -73,6 +74,10 @@ export function parseCommandLineArgs(): CLIArgs {
describe: 'Speed factor for the audio playback', describe: 'Speed factor for the audio playback',
type: 'number' type: 'number'
}) })
.option('ttsInstructions', {
describe: 'Instructions for TTS voice style (gpt-4o-mini-tts)',
type: 'string'
})
.option('outputDir', { .option('outputDir', {
alias: 'o', alias: 'o',
describe: 'Directory for output files', describe: 'Directory for output files',

View File

@@ -18,6 +18,7 @@ export interface Config {
ttsProvider: string; ttsProvider: string;
ttsVoice: string; ttsVoice: string;
ttsSpeedFactor: number; ttsSpeedFactor: number;
ttsInstructions?: string;
ttsProviders: { ttsProviders: {
[key: string]: TTSProviderConfig; [key: string]: TTSProviderConfig;
}; };
@@ -68,10 +69,11 @@ export function getDefaultConfig(): Config {
ttsProvider: "openai", ttsProvider: "openai",
ttsVoice: "alloy", ttsVoice: "alloy",
ttsSpeedFactor: 1.5, ttsSpeedFactor: 1.5,
ttsInstructions: "Speak in a calm, narrating tone suitable for audio descriptions. Keep a steady pace and clear enunciation.",
ttsProviders: { ttsProviders: {
openai: { openai: {
apiKey: process.env.OPENAI_API_KEY, apiKey: process.env.OPENAI_API_KEY,
model: "tts-1-hd", model: "gpt-4o-mini-tts",
voice: "alloy" voice: "alloy"
} }
}, },

View File

@@ -77,6 +77,10 @@ async function main(): Promise<void> {
config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice; config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
} }
if (argv.ttsInstructions) {
config.ttsInstructions = argv.ttsInstructions;
}
if (argv.saveConfig) { if (argv.saveConfig) {
saveConfigToFile(argv.saveConfig, config); saveConfigToFile(argv.saveConfig, config);
} }

View File

@@ -39,6 +39,7 @@ export interface TTSOptions {
voice?: string; voice?: string;
model?: string; model?: string;
speedFactor?: number; speedFactor?: number;
instructions?: string;
} }
export interface TTSProviderConfig { export interface TTSProviderConfig {

View File

@@ -41,8 +41,9 @@ export class OpenAITTSProvider implements TTSProvider {
const mp3 = await this.openai.audio.speech.create({ const mp3 = await this.openai.audio.speech.create({
model: model, model: model,
voice: voice as any, // Type casting to any to avoid type issues voice: voice as any,
input: text input: text,
...(options.instructions ? { instructions: options.instructions } : {})
}); });
// Cost calculation is based on character count // Cost calculation is based on character count

View File

@@ -173,7 +173,8 @@ export async function generateAudioDescription(
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, { const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
voice: settings.ttsVoice, voice: settings.ttsVoice,
model: settings.ttsProviders[settings.ttsProvider].model, model: settings.ttsProviders[settings.ttsProvider].model,
speedFactor: settings.ttsSpeedFactor speedFactor: settings.ttsSpeedFactor,
instructions: settings.ttsInstructions
}); });
const audioDuration = ttsResult.duration; const audioDuration = ttsResult.duration;
@@ -304,7 +305,8 @@ async function generateAudioDescriptionBatch(
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, { const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
voice: settings.ttsVoice, voice: settings.ttsVoice,
model: settings.ttsProviders[settings.ttsProvider].model, model: settings.ttsProviders[settings.ttsProvider].model,
speedFactor: settings.ttsSpeedFactor speedFactor: settings.ttsSpeedFactor,
instructions: settings.ttsInstructions
}); });
const audioDuration = ttsResult.duration; const audioDuration = ttsResult.duration;