Enhance OpenAI TTS: add gpt-4o-mini-tts support with instructions parameter
This commit is contained in:
@@ -13,6 +13,7 @@ export interface CLIArgs {
|
|||||||
ttsModel?: string;
|
ttsModel?: string;
|
||||||
ttsVoice?: string;
|
ttsVoice?: string;
|
||||||
ttsSpeedFactor?: number;
|
ttsSpeedFactor?: number;
|
||||||
|
ttsInstructions?: string;
|
||||||
outputDir?: string;
|
outputDir?: string;
|
||||||
tempDir?: string;
|
tempDir?: string;
|
||||||
batchTimeMode?: boolean;
|
batchTimeMode?: boolean;
|
||||||
@@ -73,6 +74,10 @@ export function parseCommandLineArgs(): CLIArgs {
|
|||||||
describe: 'Speed factor for the audio playback',
|
describe: 'Speed factor for the audio playback',
|
||||||
type: 'number'
|
type: 'number'
|
||||||
})
|
})
|
||||||
|
.option('ttsInstructions', {
|
||||||
|
describe: 'Instructions for TTS voice style (gpt-4o-mini-tts)',
|
||||||
|
type: 'string'
|
||||||
|
})
|
||||||
.option('outputDir', {
|
.option('outputDir', {
|
||||||
alias: 'o',
|
alias: 'o',
|
||||||
describe: 'Directory for output files',
|
describe: 'Directory for output files',
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ export interface Config {
|
|||||||
ttsProvider: string;
|
ttsProvider: string;
|
||||||
ttsVoice: string;
|
ttsVoice: string;
|
||||||
ttsSpeedFactor: number;
|
ttsSpeedFactor: number;
|
||||||
|
ttsInstructions?: string;
|
||||||
ttsProviders: {
|
ttsProviders: {
|
||||||
[key: string]: TTSProviderConfig;
|
[key: string]: TTSProviderConfig;
|
||||||
};
|
};
|
||||||
@@ -68,10 +69,11 @@ export function getDefaultConfig(): Config {
|
|||||||
ttsProvider: "openai",
|
ttsProvider: "openai",
|
||||||
ttsVoice: "alloy",
|
ttsVoice: "alloy",
|
||||||
ttsSpeedFactor: 1.5,
|
ttsSpeedFactor: 1.5,
|
||||||
|
ttsInstructions: "Speak in a calm, narrating tone suitable for audio descriptions. Keep a steady pace and clear enunciation.",
|
||||||
ttsProviders: {
|
ttsProviders: {
|
||||||
openai: {
|
openai: {
|
||||||
apiKey: process.env.OPENAI_API_KEY,
|
apiKey: process.env.OPENAI_API_KEY,
|
||||||
model: "tts-1-hd",
|
model: "gpt-4o-mini-tts",
|
||||||
voice: "alloy"
|
voice: "alloy"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -77,6 +77,10 @@ async function main(): Promise<void> {
|
|||||||
config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
|
config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (argv.ttsInstructions) {
|
||||||
|
config.ttsInstructions = argv.ttsInstructions;
|
||||||
|
}
|
||||||
|
|
||||||
if (argv.saveConfig) {
|
if (argv.saveConfig) {
|
||||||
saveConfigToFile(argv.saveConfig, config);
|
saveConfigToFile(argv.saveConfig, config);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ export interface TTSOptions {
|
|||||||
voice?: string;
|
voice?: string;
|
||||||
model?: string;
|
model?: string;
|
||||||
speedFactor?: number;
|
speedFactor?: number;
|
||||||
|
instructions?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface TTSProviderConfig {
|
export interface TTSProviderConfig {
|
||||||
|
|||||||
@@ -41,8 +41,9 @@ export class OpenAITTSProvider implements TTSProvider {
|
|||||||
|
|
||||||
const mp3 = await this.openai.audio.speech.create({
|
const mp3 = await this.openai.audio.speech.create({
|
||||||
model: model,
|
model: model,
|
||||||
voice: voice as any, // Type casting to any to avoid type issues
|
voice: voice as any,
|
||||||
input: text
|
input: text,
|
||||||
|
...(options.instructions ? { instructions: options.instructions } : {})
|
||||||
});
|
});
|
||||||
|
|
||||||
// Cost calculation is based on character count
|
// Cost calculation is based on character count
|
||||||
|
|||||||
@@ -173,7 +173,8 @@ export async function generateAudioDescription(
|
|||||||
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
|
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
|
||||||
voice: settings.ttsVoice,
|
voice: settings.ttsVoice,
|
||||||
model: settings.ttsProviders[settings.ttsProvider].model,
|
model: settings.ttsProviders[settings.ttsProvider].model,
|
||||||
speedFactor: settings.ttsSpeedFactor
|
speedFactor: settings.ttsSpeedFactor,
|
||||||
|
instructions: settings.ttsInstructions
|
||||||
});
|
});
|
||||||
|
|
||||||
const audioDuration = ttsResult.duration;
|
const audioDuration = ttsResult.duration;
|
||||||
@@ -304,7 +305,8 @@ async function generateAudioDescriptionBatch(
|
|||||||
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
|
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
|
||||||
voice: settings.ttsVoice,
|
voice: settings.ttsVoice,
|
||||||
model: settings.ttsProviders[settings.ttsProvider].model,
|
model: settings.ttsProviders[settings.ttsProvider].model,
|
||||||
speedFactor: settings.ttsSpeedFactor
|
speedFactor: settings.ttsSpeedFactor,
|
||||||
|
instructions: settings.ttsInstructions
|
||||||
});
|
});
|
||||||
|
|
||||||
const audioDuration = ttsResult.duration;
|
const audioDuration = ttsResult.duration;
|
||||||
|
|||||||
Reference in New Issue
Block a user