Enhance OpenAI TTS: add gpt-4o-mini-tts support with instructions parameter

This commit is contained in:
2026-05-13 02:36:46 +02:00
parent fc02d2001c
commit 19975917c5
6 changed files with 20 additions and 5 deletions

View File

@@ -13,6 +13,7 @@ export interface CLIArgs {
ttsModel?: string;
ttsVoice?: string;
ttsSpeedFactor?: number;
ttsInstructions?: string;
outputDir?: string;
tempDir?: string;
batchTimeMode?: boolean;
@@ -73,6 +74,10 @@ export function parseCommandLineArgs(): CLIArgs {
describe: 'Speed factor for the audio playback',
type: 'number'
})
.option('ttsInstructions', {
describe: 'Instructions for TTS voice style (gpt-4o-mini-tts)',
type: 'string'
})
.option('outputDir', {
alias: 'o',
describe: 'Directory for output files',

View File

@@ -18,6 +18,7 @@ export interface Config {
ttsProvider: string;
ttsVoice: string;
ttsSpeedFactor: number;
ttsInstructions?: string;
ttsProviders: {
[key: string]: TTSProviderConfig;
};
@@ -68,10 +69,11 @@ export function getDefaultConfig(): Config {
ttsProvider: "openai",
ttsVoice: "alloy",
ttsSpeedFactor: 1.5,
ttsInstructions: "Speak in a calm, narrating tone suitable for audio descriptions. Keep a steady pace and clear enunciation.",
ttsProviders: {
openai: {
apiKey: process.env.OPENAI_API_KEY,
model: "tts-1-hd",
model: "gpt-4o-mini-tts",
voice: "alloy"
}
},

View File

@@ -77,6 +77,10 @@ async function main(): Promise<void> {
config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
}
if (argv.ttsInstructions) {
config.ttsInstructions = argv.ttsInstructions;
}
if (argv.saveConfig) {
saveConfigToFile(argv.saveConfig, config);
}

View File

@@ -39,6 +39,7 @@ export interface TTSOptions {
voice?: string;
model?: string;
speedFactor?: number;
instructions?: string;
}
export interface TTSProviderConfig {

View File

@@ -41,8 +41,9 @@ export class OpenAITTSProvider implements TTSProvider {
const mp3 = await this.openai.audio.speech.create({
model: model,
voice: voice as any, // Type casting to any to avoid type issues
input: text
voice: voice as any,
input: text,
...(options.instructions ? { instructions: options.instructions } : {})
});
// Cost calculation is based on character count

View File

@@ -173,7 +173,8 @@ export async function generateAudioDescription(
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
voice: settings.ttsVoice,
model: settings.ttsProviders[settings.ttsProvider].model,
speedFactor: settings.ttsSpeedFactor
speedFactor: settings.ttsSpeedFactor,
instructions: settings.ttsInstructions
});
const audioDuration = ttsResult.duration;
@@ -304,7 +305,8 @@ async function generateAudioDescriptionBatch(
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
voice: settings.ttsVoice,
model: settings.ttsProviders[settings.ttsProvider].model,
speedFactor: settings.ttsSpeedFactor
speedFactor: settings.ttsSpeedFactor,
instructions: settings.ttsInstructions
});
const audioDuration = ttsResult.duration;