Compare commits

..

5 Commits

16 changed files with 1478 additions and 169 deletions

895
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -19,6 +19,7 @@
"prepublishOnly": "npm run build" "prepublishOnly": "npm run build"
}, },
"dependencies": { "dependencies": {
"@google-cloud/text-to-speech": "^6.4.1",
"@google/generative-ai": "^0.24.0", "@google/generative-ai": "^0.24.0",
"axios": "^1.6.2", "axios": "^1.6.2",
"dotenv": "^16.3.1", "dotenv": "^16.3.1",

View File

@@ -13,6 +13,7 @@ export interface CLIArgs {
ttsModel?: string; ttsModel?: string;
ttsVoice?: string; ttsVoice?: string;
ttsSpeedFactor?: number; ttsSpeedFactor?: number;
ttsInstructions?: string;
outputDir?: string; outputDir?: string;
tempDir?: string; tempDir?: string;
batchTimeMode?: boolean; batchTimeMode?: boolean;
@@ -73,6 +74,10 @@ export function parseCommandLineArgs(): CLIArgs {
describe: 'Speed factor for the audio playback', describe: 'Speed factor for the audio playback',
type: 'number' type: 'number'
}) })
.option('ttsInstructions', {
describe: 'Instructions for TTS voice style (gpt-4o-mini-tts)',
type: 'string'
})
.option('outputDir', { .option('outputDir', {
alias: 'o', alias: 'o',
describe: 'Directory for output files', describe: 'Directory for output files',

View File

@@ -18,6 +18,7 @@ export interface Config {
ttsProvider: string; ttsProvider: string;
ttsVoice: string; ttsVoice: string;
ttsSpeedFactor: number; ttsSpeedFactor: number;
ttsInstructions?: string;
ttsProviders: { ttsProviders: {
[key: string]: TTSProviderConfig; [key: string]: TTSProviderConfig;
}; };
@@ -61,6 +62,12 @@ export function getDefaultConfig(): Config {
baseUrl: "http://localhost:11434", baseUrl: "http://localhost:11434",
model: "gemma3:12b", model: "gemma3:12b",
maxTokens: 3000 maxTokens: 3000
},
openrouter: {
apiKey: process.env.OPENROUTER_API_KEY,
model: "anthropic/claude-sonnet-4.5",
baseUrl: "https://openrouter.ai/api/v1",
maxTokens: 300
} }
}, },
@@ -68,11 +75,23 @@ export function getDefaultConfig(): Config {
ttsProvider: "openai", ttsProvider: "openai",
ttsVoice: "alloy", ttsVoice: "alloy",
ttsSpeedFactor: 1.5, ttsSpeedFactor: 1.5,
ttsInstructions: "Speak in a calm, narrating tone suitable for audio descriptions. Keep a steady pace and clear enunciation.",
ttsProviders: { ttsProviders: {
openai: { openai: {
apiKey: process.env.OPENAI_API_KEY, apiKey: process.env.OPENAI_API_KEY,
model: "tts-1-hd", model: "gpt-4o-mini-tts",
voice: "alloy" voice: "alloy"
},
elevenlabs: {
apiKey: process.env.ELEVENLABS_API_KEY,
model: "eleven_multilingual_v2",
voice: "JBFqnCBsd6RMkjVDRZzb"
},
google: {
apiKey: process.env.GOOGLE_CLOUD_TTS_KEY,
keyFilename: process.env.GOOGLE_CLOUD_TTS_KEYFILE,
model: "chirp-hd",
voice: "en-US-Chirp-HD-F"
} }
}, },

View File

@@ -77,6 +77,10 @@ async function main(): Promise<void> {
config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice; config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
} }
if (argv.ttsInstructions) {
config.ttsInstructions = argv.ttsInstructions;
}
if (argv.saveConfig) { if (argv.saveConfig) {
saveConfigToFile(argv.saveConfig, config); saveConfigToFile(argv.saveConfig, config);
} }

View File

@@ -39,12 +39,14 @@ export interface TTSOptions {
voice?: string; voice?: string;
model?: string; model?: string;
speedFactor?: number; speedFactor?: number;
instructions?: string;
} }
export interface TTSProviderConfig { export interface TTSProviderConfig {
apiKey?: string; apiKey?: string;
model: string; model: string;
voice?: string; voice?: string;
keyFilename?: string;
} }
export interface TTSProvider { export interface TTSProvider {

View File

@@ -0,0 +1,93 @@
import fs from 'fs';
import { execSync } from 'child_process';
import axios, { AxiosInstance } from 'axios';
import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
import { getAudioDuration } from '../../utils/mediaUtils';
/**
 * Text-to-speech provider that calls the ElevenLabs HTTP API
 * (`POST /text-to-speech/{voice}`) and writes the result as an MP3 file.
 *
 * Playback speed is applied exactly once, locally, with ffmpeg's `atempo`
 * filter. The speed factor is intentionally NOT also sent to the API:
 * doing both would apply the factor twice (effective speed = factor²).
 */
export class ElevenLabsTTSProvider implements TTSProvider {
  private config: TTSProviderConfig;
  private axiosInstance: AxiosInstance;
  /**
   * `request-id` response header of the most recent successful call. It is
   * sent back as `previous_request_ids` on the next call.
   */
  private lastRequestId: string | null = null;

  /**
   * @param config - Provider settings; `apiKey` is sent as the `xi-api-key`
   *   header, `voice` and `model` act as defaults for `textToSpeech`.
   */
  constructor(config: TTSProviderConfig) {
    this.config = config;
    this.axiosInstance = axios.create({
      baseURL: 'https://api.elevenlabs.io/v1',
      headers: {
        'xi-api-key': config.apiKey,
        'Content-Type': 'application/json'
      }
    });
  }

  /**
   * Synthesize `text` into an MP3 file at `outputPath`.
   *
   * On any failure the error is logged and a one-second silent MP3 is
   * written to `outputPath` instead, so the call still resolves.
   *
   * @param text - Text to speak.
   * @param outputPath - Destination file path for the final audio.
   * @param options - Optional per-call overrides for voice, model and speed.
   * @returns The duration reported by `getAudioDuration` and a cost equal to
   *   the number of characters in `text`; `{ duration: 1, cost: 0 }` when the
   *   silent fallback was used.
   */
  async textToSpeech(
    text: string,
    outputPath: string,
    options: TTSOptions = {}
  ): Promise<TTSResult> {
    // Declared outside the try block so the catch handler can clean it up.
    let tempOutputPath: string | undefined;

    try {
      const voice = options.voice || this.config.voice || 'JBFqnCBsd6RMkjVDRZzb';
      const model = options.model || this.config.model || 'eleven_multilingual_v2';
      const speedFactor = options.speedFactor || 1.0;

      // No `speed` in voice_settings: the speed factor is applied below with
      // ffmpeg only. NOTE(review): the API reportedly accepts only a narrow
      // speed range, so sending e.g. 1.5 may have been rejected outright —
      // confirm against the ElevenLabs API reference.
      const requestBody: Record<string, unknown> = {
        text,
        model_id: model,
        voice_settings: {
          stability: 0.5,
          similarity_boost: 0.75,
          use_speaker_boost: true
        }
      };

      if (this.lastRequestId) {
        requestBody.previous_request_ids = [this.lastRequestId];
      }

      // "clip.mp3" -> "clip_temp.mp3"
      tempOutputPath = outputPath.replace(/\.\w+$/, '_temp$&');

      const response = await this.axiosInstance.post(
        `/text-to-speech/${voice}`,
        requestBody,
        {
          params: { output_format: 'mp3_44100_128' },
          responseType: 'arraybuffer'
        }
      );

      this.lastRequestId = response.headers['request-id'] || null;

      const audioBuffer = Buffer.from(response.data);
      fs.writeFileSync(tempOutputPath, audioBuffer);

      // Cost is reported as the number of characters submitted.
      const cost = text.length;

      if (speedFactor !== 1.0) {
        execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
        fs.unlinkSync(tempOutputPath);
      } else {
        fs.renameSync(tempOutputPath, outputPath);
      }

      const audioDuration = getAudioDuration(outputPath);

      return {
        duration: audioDuration,
        cost: cost
      };
    } catch (error: any) {
      if (error.response) {
        // The response body is binary because of responseType 'arraybuffer'.
        console.error(`ElevenLabs TTS error (${error.response.status}):`,
          Buffer.from(error.response.data).toString());
      } else {
        console.error('ElevenLabs TTS error:', error.message);
      }

      // Do not leave a half-processed temp file behind (e.g. ffmpeg failed).
      if (tempOutputPath && tempOutputPath !== outputPath && fs.existsSync(tempOutputPath)) {
        try {
          fs.unlinkSync(tempOutputPath);
        } catch (cleanupError: any) {
          console.warn('ElevenLabs TTS: could not remove temp file:', cleanupError.message);
        }
      }

      // Fallback: one second of silence so downstream steps still get a file.
      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);

      return {
        duration: 1,
        cost: 0
      };
    }
  }
}

View File

@@ -0,0 +1,94 @@
import fs from 'fs';
import { execSync } from 'child_process';
import { TextToSpeechClient } from '@google-cloud/text-to-speech';
import { google } from '@google-cloud/text-to-speech/build/protos/protos';
import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
import { getAudioDuration } from '../../utils/mediaUtils';
/**
 * Text-to-speech provider backed by the Google Cloud Text-to-Speech client.
 *
 * Playback speed is applied exactly once, locally, with ffmpeg's `atempo`
 * filter. The speed factor is intentionally NOT also sent as `speakingRate`:
 * doing both would apply the factor twice (effective speed = factor²).
 */
export class GoogleCloudTTSProvider implements TTSProvider {
  private config: TTSProviderConfig;
  private client: TextToSpeechClient;

  /**
   * @param config - Provider settings. `apiKey` is always passed to the
   *   client; `keyFilename` is passed only when set. `voice` acts as the
   *   default for `textToSpeech`.
   */
  constructor(config: TTSProviderConfig) {
    this.config = config;

    const clientConfig: any = {
      apiKey: config.apiKey,
      fallback: true
    };

    if (config.keyFilename) {
      clientConfig.keyFilename = config.keyFilename;
    }

    this.client = new TextToSpeechClient(clientConfig);
  }

  /**
   * Synthesize `text` into an MP3 file at `outputPath`.
   *
   * On any failure the error is logged and a one-second silent MP3 is
   * written to `outputPath` instead, so the call still resolves.
   *
   * @param text - Text to speak.
   * @param outputPath - Destination file path for the final audio.
   * @param options - Optional per-call overrides for voice and speed.
   *   `options.model` is not used by this provider's request.
   * @returns The duration reported by `getAudioDuration` and a cost equal to
   *   the number of characters in `text`; `{ duration: 1, cost: 0 }` when the
   *   silent fallback was used.
   */
  async textToSpeech(
    text: string,
    outputPath: string,
    options: TTSOptions = {}
  ): Promise<TTSResult> {
    // Declared outside the try block so the catch handler can clean it up.
    let tempOutputPath: string | undefined;

    try {
      const voice = options.voice || this.config.voice || 'en-US-Chirp-HD-F';
      const speedFactor = options.speedFactor || 1.0;

      // No speakingRate here: the speed factor is applied below with ffmpeg.
      const request: google.cloud.texttospeech.v1.ISynthesizeSpeechRequest = {
        input: { text },
        voice: {
          languageCode: this.extractLanguageCode(voice),
          name: voice
        },
        audioConfig: {
          audioEncoding: 'MP3'
        }
      };

      const [response] = await this.client.synthesizeSpeech(request);

      if (!response.audioContent) {
        throw new Error('No audio content returned from Google Cloud TTS');
      }

      // NOTE(review): a string payload is presumed to be base64 (the JSON
      // encoding of a protobuf `bytes` field) — confirm; decoding it as
      // UTF-8 would produce a corrupt MP3.
      const audioBuffer = typeof response.audioContent === 'string'
        ? Buffer.from(response.audioContent, 'base64')
        : Buffer.from(response.audioContent);

      // "clip.mp3" -> "clip_temp.mp3"
      tempOutputPath = outputPath.replace(/\.\w+$/, '_temp$&');
      fs.writeFileSync(tempOutputPath, audioBuffer);

      // Cost is reported as the number of characters submitted.
      const cost = text.length;

      if (speedFactor !== 1.0) {
        execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
        fs.unlinkSync(tempOutputPath);
      } else {
        fs.renameSync(tempOutputPath, outputPath);
      }

      const audioDuration = getAudioDuration(outputPath);

      return {
        duration: audioDuration,
        cost: cost
      };
    } catch (error: any) {
      console.error('Google Cloud TTS error:', error.message);

      // Do not leave a half-processed temp file behind (e.g. ffmpeg failed).
      if (tempOutputPath && tempOutputPath !== outputPath && fs.existsSync(tempOutputPath)) {
        try {
          fs.unlinkSync(tempOutputPath);
        } catch (cleanupError: any) {
          console.warn('Google Cloud TTS: could not remove temp file:', cleanupError.message);
        }
      }

      // Fallback: one second of silence so downstream steps still get a file.
      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);

      return {
        duration: 1,
        cost: 0
      };
    }
  }

  /**
   * Derive the language code from a voice name by joining its first two
   * dash-separated parts (e.g. "en-US-Chirp-HD-F" -> "en-US").
   * Falls back to "en-US" when the name contains no dash.
   */
  private extractLanguageCode(voiceName: string): string {
    const parts = voiceName.split('-');
    if (parts.length >= 2) {
      return `${parts[0]}-${parts[1]}`;
    }
    return 'en-US';
  }
}

View File

@@ -1,2 +1,4 @@
export * from './ttsProviderFactory'; export * from './ttsProviderFactory';
export * from './openAITTSProvider'; export * from './openAITTSProvider';
export * from './elevenLabsTTSProvider';
export * from './googleCloudTTSProvider';

View File

@@ -41,8 +41,9 @@ export class OpenAITTSProvider implements TTSProvider {
const mp3 = await this.openai.audio.speech.create({ const mp3 = await this.openai.audio.speech.create({
model: model, model: model,
voice: voice as any, // Type casting to any to avoid type issues voice: voice as any,
input: text input: text,
...(options.instructions ? { instructions: options.instructions } : {})
}); });
// Cost calculation is based on character count // Cost calculation is based on character count

View File

@@ -1,6 +1,8 @@
import { TTSProvider } from '../../interfaces'; import { TTSProvider } from '../../interfaces';
import { Config } from '../../config/config'; import { Config } from '../../config/config';
import { OpenAITTSProvider } from './openAITTSProvider'; import { OpenAITTSProvider } from './openAITTSProvider';
import { ElevenLabsTTSProvider } from './elevenLabsTTSProvider';
import { GoogleCloudTTSProvider } from './googleCloudTTSProvider';
/** /**
* Factory for creating TTS providers * Factory for creating TTS providers
@@ -17,6 +19,10 @@ export class TTSProviderFactory {
switch (providerName) { switch (providerName) {
case 'openai': case 'openai':
return new OpenAITTSProvider(providerConfig); return new OpenAITTSProvider(providerConfig);
case 'elevenlabs':
return new ElevenLabsTTSProvider(providerConfig);
case 'google':
return new GoogleCloudTTSProvider(providerConfig);
// Add other providers here // Add other providers here
default: default:
throw new Error(`TTS provider "${providerName}" not implemented.`); throw new Error(`TTS provider "${providerName}" not implemented.`);

View File

@@ -2,3 +2,4 @@ export * from './visionProviderFactory';
export * from './openAIVisionProvider'; export * from './openAIVisionProvider';
export * from './geminiVisionProvider'; export * from './geminiVisionProvider';
export * from './ollamaVisionProvider'; export * from './ollamaVisionProvider';
export * from './openRouterVisionProvider';

View File

@@ -0,0 +1,171 @@
import fs from 'fs';
import axios, { AxiosInstance } from 'axios';
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
/**
 * Vision provider that posts images to a `/chat/completions` endpoint
 * (OpenRouter unless `config.baseUrl` overrides it) and returns the model's
 * text description together with token usage.
 *
 * Every public method catches its own failures, logs them, and resolves with
 * a placeholder description and zeroed usage instead of rejecting.
 */
export class OpenRouterVisionProvider implements VisionProvider {
  private config: VisionProviderConfig;
  private axiosInstance: AxiosInstance;

  /**
   * @param config - Provider settings: `apiKey` becomes the bearer token,
   *   `baseUrl` optionally replaces the default endpoint, `model` and
   *   `maxTokens` are used on every request.
   */
  constructor(config: VisionProviderConfig) {
    this.config = config;

    const headers = {
      'Authorization': `Bearer ${config.apiKey}`,
      'Content-Type': 'application/json',
      'HTTP-Referer': 'https://github.com/anomalyco/aidio-description',
      'X-Title': 'Aidio Description Generator'
    };

    this.axiosInstance = axios.create({
      baseURL: config.baseUrl || 'https://openrouter.ai/api/v1',
      headers
    });
  }

  /**
   * Describe a single image.
   *
   * @param imagePath - Path of the image file to send.
   * @param prompt - Instruction text sent alongside the image.
   */
  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
    try {
      const content = [
        { type: 'text', text: prompt },
        this.imagePart(imagePath)
      ];

      const { data } = await this.axiosInstance.post('/chat/completions', {
        model: this.config.model,
        temperature: 0.1,
        messages: [{ role: 'user', content }],
        max_tokens: this.config.maxTokens || 300
      });

      return this.toVisionResult(data);
    } catch (error: any) {
      console.error('OpenRouter describeImage error:', error.response?.data || error.message);
      return this.placeholderResult('Unable to describe this image.');
    }
  }

  /**
   * Describe two images in one request (both are attached to the same user
   * message, first image first).
   *
   * @param image1Path - Path of the first image file.
   * @param image2Path - Path of the second image file.
   * @param prompt - Instruction text sent alongside the images.
   */
  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
    try {
      const content = [
        { type: 'text', text: prompt },
        this.imagePart(image1Path),
        this.imagePart(image2Path)
      ];

      const { data } = await this.axiosInstance.post('/chat/completions', {
        model: this.config.model,
        temperature: 0.1,
        messages: [{ role: 'user', content }],
        max_tokens: this.config.maxTokens || 300
      });

      return this.toVisionResult(data);
    } catch (error: any) {
      console.error('OpenRouter compareImages error:', error.response?.data || error.message);
      return this.placeholderResult('Unable to describe the differences between these images.');
    }
  }

  /**
   * Describe a batch of images in one request. When the previous batch left
   * a description, it is prepended as a system message.
   *
   * @param imagePaths - Paths of the image files, sent in the given order.
   * @param lastBatchContext - Context of the previous batch; only its
   *   `lastDescription` is read here.
   * @param prompt - Instruction text sent alongside the images.
   */
  async describeBatch(
    imagePaths: string[],
    lastBatchContext: BatchContext,
    prompt: string
  ): Promise<VisionResult> {
    try {
      const userContent: any[] = [{ type: 'text', text: prompt }];
      for (const framePath of imagePaths) {
        userContent.push(this.imagePart(framePath));
      }

      const messages: any[] = [];
      const previousSummary = lastBatchContext?.lastDescription;
      if (previousSummary) {
        messages.push({
          role: 'system',
          content: `Previous batch summary: ${previousSummary}`
        });
      }
      messages.push({ role: 'user', content: userContent });

      // NOTE(review): unlike the single-image calls, no temperature is sent
      // here — confirm whether that is intentional.
      const { data } = await this.axiosInstance.post('/chat/completions', {
        model: this.config.model,
        messages,
        max_tokens: this.config.maxTokens || 300
      });

      return this.toVisionResult(data);
    } catch (error: any) {
      console.error('OpenRouter describeBatch error:', error.response?.data || error.message);
      return this.placeholderResult('Unable to describe this batch of images.');
    }
  }

  /** Read a file and wrap it as an `image_url` content part (base64 data URL labelled image/jpeg). */
  private imagePart(filePath: string): { type: string; image_url: { url: string } } {
    const encoded = fs.readFileSync(filePath).toString('base64');
    return {
      type: 'image_url',
      image_url: { url: `data:image/jpeg;base64,${encoded}` }
    };
  }

  /** Pull the first choice's trimmed text and the usage counters out of a response body. */
  private toVisionResult(data: any): VisionResult {
    const usage = data.usage;
    return {
      description: data.choices?.[0]?.message?.content?.trim() || 'No description generated.',
      usage: {
        inputTokens: usage?.prompt_tokens || 0,
        outputTokens: usage?.completion_tokens || 0,
        totalTokens: usage?.total_tokens || 0
      }
    };
  }

  /** Result returned when a request fails: the given text with all usage counters at zero. */
  private placeholderResult(description: string): VisionResult {
    return {
      description,
      usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
    };
  }
}

View File

@@ -3,6 +3,7 @@ import { Config } from '../../config/config';
import { OpenAIVisionProvider } from './openAIVisionProvider'; import { OpenAIVisionProvider } from './openAIVisionProvider';
import { GeminiVisionProvider } from './geminiVisionProvider'; import { GeminiVisionProvider } from './geminiVisionProvider';
import { OllamaVisionProvider } from './ollamaVisionProvider'; import { OllamaVisionProvider } from './ollamaVisionProvider';
import { OpenRouterVisionProvider } from './openRouterVisionProvider';
/** /**
* Factory for creating vision AI providers * Factory for creating vision AI providers
@@ -23,6 +24,8 @@ export class VisionProviderFactory {
return new GeminiVisionProvider(providerConfig); return new GeminiVisionProvider(providerConfig);
case "ollama": case "ollama":
return new OllamaVisionProvider(providerConfig); return new OllamaVisionProvider(providerConfig);
case 'openrouter':
return new OpenRouterVisionProvider(providerConfig);
// Add other providers here // Add other providers here
default: default:
throw new Error(`Vision provider "${providerName}" not implemented.`); throw new Error(`Vision provider "${providerName}" not implemented.`);

View File

@@ -2,6 +2,8 @@ import { Config } from '../config/config';
import { CostBreakdown } from '../interfaces'; import { CostBreakdown } from '../interfaces';
import { getVideoDuration } from './mediaUtils'; import { getVideoDuration } from './mediaUtils';
type TTSPricingModel = number | { inputTokens: number; outputTokens: number };
/** /**
* Estimate the cost of generating audio descriptions for a video * Estimate the cost of generating audio descriptions for a video
* @param videoFilePath - Path to the input video file * @param videoFilePath - Path to the input video file
@@ -26,39 +28,54 @@ export async function estimateCost(
if (settings.batchTimeMode) { if (settings.batchTimeMode) {
totalUnits = Math.floor(videoDuration / settings.batchWindowDuration); totalUnits = Math.floor(videoDuration / settings.batchWindowDuration);
unitCostMultiplier = settings.framesInBatch; // Cost multiplier for batch mode unitCostMultiplier = settings.framesInBatch;
unitType = "batches"; unitType = "batches";
} else { } else {
totalUnits = Math.floor(videoDuration / settings.captureIntervalSeconds); totalUnits = Math.floor(videoDuration / settings.captureIntervalSeconds);
unitCostMultiplier = 1; // No multiplier for normal mode unitCostMultiplier = 1;
unitType = "frames"; unitType = "frames";
} }
console.log(`Will process ${totalUnits} ${unitType}`); console.log(`Will process ${totalUnits} ${unitType}`);
// Pricing constants (as of March 2025, update as needed) // Pricing constants (per 1K units unless otherwise noted)
const pricing: { const pricing: {
vision: Record<string, Record<string, { input: number; output: number }>>; vision: Record<string, Record<string, { input: number; output: number }>>;
tts: Record<string, Record<string, number>>; tts: Record<string, Record<string, TTSPricingModel>>;
} = { } = {
vision: { vision: {
openai: { openai: {
'gpt-4o': { 'gpt-4o': { input: 0.0025, output: 0.01 },
input: 0.0025, 'gpt-5.4-mini': { input: 0.00015, output: 0.0006 },
output: 0.01 'gpt-4o-mini': { input: 0.00015, output: 0.0006 }
}
}, },
gemini: { gemini: {
'gemini-pro-vision': { 'gemini-2.0-flash': { input: 0.0001, output: 0.0004 },
input: 0.0025, 'gemini-1.5-flash': { input: 0.000075, output: 0.0003 },
output: 0.0025 'gemini-1.5-pro': { input: 0.00125, output: 0.005 }
} },
openrouter: {
'anthropic/claude-sonnet-4.5': { input: 0.003, output: 0.015 },
'anthropic/claude-3.5-sonnet': { input: 0.003, output: 0.015 },
'anthropic/claude-3-haiku': { input: 0.0008, output: 0.004 },
'google/gemini-2.0-flash-001': { input: 0.0001, output: 0.0004 }
} }
}, },
tts: { tts: {
openai: { openai: {
'tts-1': 0.015, 'tts-1': 0.015,
'tts-1-hd': 0.030 'tts-1-hd': 0.030,
'gpt-4o-mini-tts': { inputTokens: 0.60, outputTokens: 12.00 }
},
elevenlabs: {
'eleven_multilingual_v2': 0.30,
'eleven_turbo_v2.5': 0.015
},
google: {
'chirp-hd': 0.016,
'wavenet': 0.016,
'neural2': 0.016,
'standard': 0.004
} }
} }
}; };
@@ -82,27 +99,27 @@ export async function estimateCost(
} }
// Estimated token counts // Estimated token counts
const estimatedVisionInputTokens = 1000 * unitCostMultiplier; // Base tokens for the vision input const estimatedVisionInputTokens = 1000 * unitCostMultiplier;
const estimatedPromptTokens = 100; // Tokens for the prompt text const estimatedPromptTokens = 100;
const estimatedOutputTokensPerUnit = 75; // Average tokens for description output const estimatedOutputTokensPerUnit = 75;
// Estimated character counts for TTS // Estimated character counts for TTS
const estimatedCharsPerDescription = 200; // Average characters per description const estimatedCharsPerDescription = 200;
// Calculate estimated costs for first unit // Calculate estimated costs for first unit
const firstUnitCost = { const firstUnitCost = {
visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000, visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000, visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000 tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
}; };
// For subsequent units, we need context (e.g., previous frames) // For subsequent units, we need context (e.g., previous frames)
const contextMultiplier = settings.batchTimeMode ? 1.2 : 2; // Less overhead in batch mode const contextMultiplier = settings.batchTimeMode ? 1.2 : 2;
const subsequentUnitCost = { const subsequentUnitCost = {
visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000, visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000, visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000 tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
}; };
// Calculate total costs // Calculate total costs
@@ -141,10 +158,25 @@ export async function estimateCost(
total: totalCost.toFixed(4) total: totalCost.toFixed(4)
}, },
estimates: { estimates: {
totalAPICallsToProviders: totalUnits * 2, // Vision + TTS for each unit totalAPICallsToProviders: totalUnits * 2,
estimatedProcessingTimeMinutes: (totalUnits * 3) / 60 // rough estimate, 3 seconds per unit estimatedProcessingTimeMinutes: (totalUnits * 3) / 60
} }
}; };
return costBreakdown; return costBreakdown;
} }
/**
 * Estimate the TTS cost of speaking `charCount` characters.
 *
 * @param charCount - Number of characters in the text to be spoken.
 * @param pricing - Either a flat rate per 1,000 characters (number) or a
 *   pair of per-1,000,000-token rates for input and output tokens.
 *   A missing or zero rate yields a cost of 0.
 * @returns The estimated cost, in the same currency unit as the rates.
 */
function calculateTTSCost(charCount: number, pricing: TTSPricingModel | undefined): number {
  if (typeof pricing === 'number') {
    // Flat rate: price is quoted per 1,000 characters.
    return pricing ? (charCount * pricing) / 1000 : 0;
  }

  if (!pricing) {
    return 0;
  }

  // Token-based rate (e.g. gpt-4o-mini-tts): prices are quoted per 1M tokens.
  // Heuristics used by this estimate: about 0.25 input tokens per character,
  // and about 3 output (audio) tokens per character.
  const tokensPerMillion = 1000000;
  const inputTokenEstimate = charCount * 0.25;
  const outputTokenEstimate = charCount * 3;
  const inputCost = inputTokenEstimate * pricing.inputTokens;
  const outputCost = outputTokenEstimate * pricing.outputTokens;

  return (inputCost + outputCost) / tokensPerMillion;
}

View File

@@ -173,7 +173,8 @@ export async function generateAudioDescription(
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, { const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
voice: settings.ttsVoice, voice: settings.ttsVoice,
model: settings.ttsProviders[settings.ttsProvider].model, model: settings.ttsProviders[settings.ttsProvider].model,
speedFactor: settings.ttsSpeedFactor speedFactor: settings.ttsSpeedFactor,
instructions: settings.ttsInstructions
}); });
const audioDuration = ttsResult.duration; const audioDuration = ttsResult.duration;
@@ -304,7 +305,8 @@ async function generateAudioDescriptionBatch(
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, { const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
voice: settings.ttsVoice, voice: settings.ttsVoice,
model: settings.ttsProviders[settings.ttsProvider].model, model: settings.ttsProviders[settings.ttsProvider].model,
speedFactor: settings.ttsSpeedFactor speedFactor: settings.ttsSpeedFactor,
instructions: settings.ttsInstructions
}); });
const audioDuration = ttsResult.duration; const audioDuration = ttsResult.duration;