Enhance OpenAI TTS: add gpt-4o-mini-tts support with instructions parameter
This commit is contained in:
@@ -13,6 +13,7 @@ export interface CLIArgs {
|
||||
ttsModel?: string;
|
||||
ttsVoice?: string;
|
||||
ttsSpeedFactor?: number;
|
||||
ttsInstructions?: string;
|
||||
outputDir?: string;
|
||||
tempDir?: string;
|
||||
batchTimeMode?: boolean;
|
||||
@@ -73,6 +74,10 @@ export function parseCommandLineArgs(): CLIArgs {
|
||||
describe: 'Speed factor for the audio playback',
|
||||
type: 'number'
|
||||
})
|
||||
.option('ttsInstructions', {
|
||||
describe: 'Instructions for TTS voice style (gpt-4o-mini-tts)',
|
||||
type: 'string'
|
||||
})
|
||||
.option('outputDir', {
|
||||
alias: 'o',
|
||||
describe: 'Directory for output files',
|
||||
|
||||
@@ -18,6 +18,7 @@ export interface Config {
|
||||
ttsProvider: string;
|
||||
ttsVoice: string;
|
||||
ttsSpeedFactor: number;
|
||||
ttsInstructions?: string;
|
||||
ttsProviders: {
|
||||
[key: string]: TTSProviderConfig;
|
||||
};
|
||||
@@ -68,10 +69,11 @@ export function getDefaultConfig(): Config {
|
||||
ttsProvider: "openai",
|
||||
ttsVoice: "alloy",
|
||||
ttsSpeedFactor: 1.5,
|
||||
ttsInstructions: "Speak in a calm, narrating tone suitable for audio descriptions. Keep a steady pace and clear enunciation.",
|
||||
ttsProviders: {
|
||||
openai: {
|
||||
apiKey: process.env.OPENAI_API_KEY,
|
||||
- model: "tts-1-hd",
|
||||
+ model: "gpt-4o-mini-tts",
|
||||
voice: "alloy"
|
||||
}
|
||||
},
|
||||
|
||||
@@ -77,6 +77,10 @@ async function main(): Promise<void> {
|
||||
config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
|
||||
}
|
||||
|
||||
if (argv.ttsInstructions) {
|
||||
config.ttsInstructions = argv.ttsInstructions;
|
||||
}
|
||||
|
||||
if (argv.saveConfig) {
|
||||
saveConfigToFile(argv.saveConfig, config);
|
||||
}
|
||||
|
||||
@@ -39,6 +39,7 @@ export interface TTSOptions {
|
||||
voice?: string;
|
||||
model?: string;
|
||||
speedFactor?: number;
|
||||
instructions?: string;
|
||||
}
|
||||
|
||||
export interface TTSProviderConfig {
|
||||
|
||||
@@ -41,8 +41,9 @@ export class OpenAITTSProvider implements TTSProvider {
|
||||
|
||||
const mp3 = await this.openai.audio.speech.create({
|
||||
model: model,
|
||||
- voice: voice as any, // Type casting to any to avoid type issues
|
||||
- input: text
|
||||
+ voice: voice as any,
|
||||
+ input: text,
|
||||
+ ...(options.instructions ? { instructions: options.instructions } : {})
|
||||
});
|
||||
|
||||
// Cost calculation is based on character count
|
||||
|
||||
@@ -173,7 +173,8 @@ export async function generateAudioDescription(
|
||||
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
|
||||
voice: settings.ttsVoice,
|
||||
model: settings.ttsProviders[settings.ttsProvider].model,
|
||||
- speedFactor: settings.ttsSpeedFactor
|
||||
+ speedFactor: settings.ttsSpeedFactor,
|
||||
+ instructions: settings.ttsInstructions
|
||||
});
|
||||
|
||||
const audioDuration = ttsResult.duration;
|
||||
@@ -304,7 +305,8 @@ async function generateAudioDescriptionBatch(
|
||||
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
|
||||
voice: settings.ttsVoice,
|
||||
model: settings.ttsProviders[settings.ttsProvider].model,
|
||||
- speedFactor: settings.ttsSpeedFactor
|
||||
+ speedFactor: settings.ttsSpeedFactor,
|
||||
+ instructions: settings.ttsInstructions
|
||||
});
|
||||
|
||||
const audioDuration = ttsResult.duration;
|
||||
|
||||
Reference in New Issue
Block a user