Compare commits

...

5 Commits

16 changed files with 1478 additions and 169 deletions

895
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -19,6 +19,7 @@
"prepublishOnly": "npm run build"
},
"dependencies": {
"@google-cloud/text-to-speech": "^6.4.1",
"@google/generative-ai": "^0.24.0",
"axios": "^1.6.2",
"dotenv": "^16.3.1",
@@ -51,4 +52,4 @@
],
"author": "",
"license": "MIT"
}
}

View File

@@ -13,6 +13,7 @@ export interface CLIArgs {
ttsModel?: string;
ttsVoice?: string;
ttsSpeedFactor?: number;
ttsInstructions?: string;
outputDir?: string;
tempDir?: string;
batchTimeMode?: boolean;
@@ -73,6 +74,10 @@ export function parseCommandLineArgs(): CLIArgs {
describe: 'Speed factor for the audio playback',
type: 'number'
})
.option('ttsInstructions', {
describe: 'Instructions for TTS voice style (gpt-4o-mini-tts)',
type: 'string'
})
.option('outputDir', {
alias: 'o',
describe: 'Directory for output files',

View File

@@ -18,6 +18,7 @@ export interface Config {
ttsProvider: string;
ttsVoice: string;
ttsSpeedFactor: number;
ttsInstructions?: string;
ttsProviders: {
[key: string]: TTSProviderConfig;
};
@@ -61,6 +62,12 @@ export function getDefaultConfig(): Config {
baseUrl: "http://localhost:11434",
model: "gemma3:12b",
maxTokens: 3000
},
openrouter: {
apiKey: process.env.OPENROUTER_API_KEY,
model: "anthropic/claude-sonnet-4.5",
baseUrl: "https://openrouter.ai/api/v1",
maxTokens: 300
}
},
@@ -68,11 +75,23 @@ export function getDefaultConfig(): Config {
ttsProvider: "openai",
ttsVoice: "alloy",
ttsSpeedFactor: 1.5,
ttsInstructions: "Speak in a calm, narrating tone suitable for audio descriptions. Keep a steady pace and clear enunciation.",
ttsProviders: {
openai: {
apiKey: process.env.OPENAI_API_KEY,
model: "tts-1-hd",
model: "gpt-4o-mini-tts",
voice: "alloy"
},
elevenlabs: {
apiKey: process.env.ELEVENLABS_API_KEY,
model: "eleven_multilingual_v2",
voice: "JBFqnCBsd6RMkjVDRZzb"
},
google: {
apiKey: process.env.GOOGLE_CLOUD_TTS_KEY,
keyFilename: process.env.GOOGLE_CLOUD_TTS_KEYFILE,
model: "chirp-hd",
voice: "en-US-Chirp-HD-F"
}
},

View File

@@ -77,6 +77,10 @@ async function main(): Promise<void> {
config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
}
if (argv.ttsInstructions) {
config.ttsInstructions = argv.ttsInstructions;
}
if (argv.saveConfig) {
saveConfigToFile(argv.saveConfig, config);
}

View File

@@ -39,12 +39,14 @@ export interface TTSOptions {
voice?: string;
model?: string;
speedFactor?: number;
instructions?: string;
}
/**
 * Settings for a single text-to-speech provider. Instances are stored per
 * provider name under `Config.ttsProviders` and handed to the provider's
 * constructor.
 */
export interface TTSProviderConfig {
// API key for the provider (sent as the `xi-api-key` header by the ElevenLabs
// provider and passed as `apiKey` to the Google Cloud client).
apiKey?: string;
// Model identifier used when a call does not override it via `TTSOptions.model`.
model: string;
// Voice identifier used when a call does not override it via `TTSOptions.voice`.
voice?: string;
// Path to a credentials key file; forwarded to the Google Cloud client when set.
keyFilename?: string;
}
export interface TTSProvider {

View File

@@ -0,0 +1,93 @@
import fs from 'fs';
import { execSync } from 'child_process';
import axios, { AxiosInstance } from 'axios';
import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
import { getAudioDuration } from '../../utils/mediaUtils';
/**
 * Text-to-speech provider backed by the ElevenLabs REST API.
 *
 * Audio is requested as MP3, written to a temporary file next to `outputPath`
 * and then (optionally) re-timed with ffmpeg. If anything fails, a one-second
 * silent MP3 is written to `outputPath` instead so callers always find a file.
 */
export class ElevenLabsTTSProvider implements TTSProvider {
  private config: TTSProviderConfig;
  private axiosInstance: AxiosInstance;
  /** `request-id` response header of the previous call, replayed as `previous_request_ids`. */
  private lastRequestId: string | null = null;

  /**
   * @param config - Provider settings; `apiKey` is sent as the `xi-api-key` header.
   */
  constructor(config: TTSProviderConfig) {
    this.config = config;
    this.axiosInstance = axios.create({
      baseURL: 'https://api.elevenlabs.io/v1',
      headers: {
        'xi-api-key': config.apiKey,
        'Content-Type': 'application/json'
      }
    });
  }

  /**
   * Synthesize `text` into an MP3 file at `outputPath`.
   *
   * @param text - Text to speak.
   * @param outputPath - Destination audio file.
   * @param options - Per-call overrides for voice, model and speed factor.
   * @returns The measured audio duration and the cost (character count of `text`);
   *          on failure `{ duration: 1, cost: 0 }` after writing silent audio.
   */
  async textToSpeech(
    text: string,
    outputPath: string,
    options: TTSOptions = {}
  ): Promise<TTSResult> {
    // `||` (not `??`) on purpose: a speed factor of 0 is meaningless and falls back to 1.0.
    const speedFactor = options.speedFactor || 1.0;

    // Insert "_temp" before the extension; if the path has no extension the
    // regex does not match, so append instead to keep temp and output distinct.
    const suffixed = outputPath.replace(/\.\w+$/, '_temp$&');
    const tempOutputPath = suffixed === outputPath ? `${outputPath}_temp` : suffixed;

    try {
      const voice = options.voice || this.config.voice || 'JBFqnCBsd6RMkjVDRZzb';
      const model = options.model || this.config.model || 'eleven_multilingual_v2';

      // The speed factor is deliberately NOT sent as `voice_settings.speed`:
      // the tempo change is applied once, by ffmpeg below. Sending it here as
      // well would apply the speed-up twice, and the API only accepts a narrow
      // speed range.
      const requestBody: Record<string, unknown> = {
        text,
        model_id: model,
        voice_settings: {
          stability: 0.5,
          similarity_boost: 0.75,
          use_speaker_boost: true
        }
      };
      if (this.lastRequestId) {
        requestBody.previous_request_ids = [this.lastRequestId];
      }

      const response = await this.axiosInstance.post(
        `/text-to-speech/${voice}`,
        requestBody,
        {
          params: { output_format: 'mp3_44100_128' },
          responseType: 'arraybuffer'
        }
      );
      this.lastRequestId = response.headers['request-id'] || null;

      fs.writeFileSync(tempOutputPath, Buffer.from(response.data));

      if (speedFactor !== 1.0) {
        // NOTE(review): paths are interpolated into a shell command line; a path
        // containing a double quote would break the quoting.
        execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
        fs.unlinkSync(tempOutputPath);
      } else {
        fs.renameSync(tempOutputPath, outputPath);
      }

      return {
        duration: getAudioDuration(outputPath),
        cost: text.length
      };
    } catch (error: any) {
      if (error.response) {
        // Decoding the error body must never throw, otherwise the silent-audio
        // fallback below would be skipped.
        let detail: string;
        try {
          detail = Buffer.from(error.response.data).toString();
        } catch {
          detail = String(error.response.data);
        }
        console.error(`ElevenLabs TTS error (${error.response.status}):`, detail);
      } else {
        console.error('ElevenLabs TTS error:', error.message);
      }

      // Best-effort removal of a leftover temp file.
      try {
        if (fs.existsSync(tempOutputPath)) {
          fs.unlinkSync(tempOutputPath);
        }
      } catch {
        // Ignore cleanup failures; the fallback audio is what matters.
      }

      // Fallback: one second of silence so downstream steps still find a file.
      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);
      return {
        duration: 1,
        cost: 0
      };
    }
  }
}

View File

@@ -0,0 +1,94 @@
import fs from 'fs';
import { execSync } from 'child_process';
import { TextToSpeechClient } from '@google-cloud/text-to-speech';
import { google } from '@google-cloud/text-to-speech/build/protos/protos';
import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
import { getAudioDuration } from '../../utils/mediaUtils';
/**
 * Text-to-speech provider backed by Google Cloud Text-to-Speech.
 *
 * Audio is requested as MP3, written to a temporary file next to `outputPath`
 * and then (optionally) re-timed with ffmpeg. If anything fails, a one-second
 * silent MP3 is written to `outputPath` instead so callers always find a file.
 */
export class GoogleCloudTTSProvider implements TTSProvider {
  private config: TTSProviderConfig;
  private client: TextToSpeechClient;

  /**
   * @param config - Provider settings; `apiKey` is always passed to the client
   *                 and `keyFilename` is added when present.
   */
  constructor(config: TTSProviderConfig) {
    this.config = config;
    const clientConfig: any = {
      apiKey: config.apiKey,
      fallback: true
    };
    if (config.keyFilename) {
      clientConfig.keyFilename = config.keyFilename;
    }
    this.client = new TextToSpeechClient(clientConfig);
  }

  /**
   * Synthesize `text` into an MP3 file at `outputPath`.
   *
   * @param text - Text to speak.
   * @param outputPath - Destination audio file.
   * @param options - Per-call overrides for voice and speed factor. The voice
   *                  name alone selects what is synthesized; `options.model`
   *                  is not part of the request.
   * @returns The measured audio duration and the cost (character count of `text`);
   *          on failure `{ duration: 1, cost: 0 }` after writing silent audio.
   */
  async textToSpeech(
    text: string,
    outputPath: string,
    options: TTSOptions = {}
  ): Promise<TTSResult> {
    // `||` (not `??`) on purpose: a speed factor of 0 is meaningless and falls back to 1.0.
    const speedFactor = options.speedFactor || 1.0;

    // Insert "_temp" before the extension; if the path has no extension the
    // regex does not match, so append instead to keep temp and output distinct.
    const suffixed = outputPath.replace(/\.\w+$/, '_temp$&');
    const tempOutputPath = suffixed === outputPath ? `${outputPath}_temp` : suffixed;

    try {
      const voice = options.voice || this.config.voice || 'en-US-Chirp-HD-F';

      // `speakingRate` is deliberately not set: the tempo change is applied
      // once, by ffmpeg below. Setting it here as well would apply the
      // speed-up twice.
      const request: google.cloud.texttospeech.v1.ISynthesizeSpeechRequest = {
        input: { text },
        voice: {
          languageCode: this.extractLanguageCode(voice),
          name: voice
        },
        audioConfig: {
          audioEncoding: 'MP3'
        }
      };

      const [response] = await this.client.synthesizeSpeech(request);
      if (!response.audioContent) {
        throw new Error('No audio content returned from Google Cloud TTS');
      }

      // Bytes fields may arrive as a base64 string rather than a Uint8Array;
      // decode that form explicitly instead of treating it as UTF-8 text.
      const audioBuffer = typeof response.audioContent === 'string'
        ? Buffer.from(response.audioContent, 'base64')
        : Buffer.from(response.audioContent);
      fs.writeFileSync(tempOutputPath, audioBuffer);

      if (speedFactor !== 1.0) {
        // NOTE(review): paths are interpolated into a shell command line; a path
        // containing a double quote would break the quoting.
        execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
        fs.unlinkSync(tempOutputPath);
      } else {
        fs.renameSync(tempOutputPath, outputPath);
      }

      return {
        duration: getAudioDuration(outputPath),
        cost: text.length
      };
    } catch (error: any) {
      console.error('Google Cloud TTS error:', error.message);

      // Best-effort removal of a leftover temp file.
      try {
        if (fs.existsSync(tempOutputPath)) {
          fs.unlinkSync(tempOutputPath);
        }
      } catch {
        // Ignore cleanup failures; the fallback audio is what matters.
      }

      // Fallback: one second of silence so downstream steps still find a file.
      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);
      return {
        duration: 1,
        cost: 0
      };
    }
  }

  /**
   * Derive the language code from a voice name by joining its first two
   * dash-separated segments (e.g. "en-US-Chirp-HD-F" -> "en-US").
   * Falls back to "en-US" when the name has fewer than two segments.
   */
  private extractLanguageCode(voiceName: string): string {
    const parts = voiceName.split('-');
    if (parts.length >= 2) {
      return `${parts[0]}-${parts[1]}`;
    }
    return 'en-US';
  }
}

View File

@@ -1,2 +1,4 @@
export * from './ttsProviderFactory';
export * from './openAITTSProvider';
export * from './openAITTSProvider';
export * from './elevenLabsTTSProvider';
export * from './googleCloudTTSProvider';

View File

@@ -41,8 +41,9 @@ export class OpenAITTSProvider implements TTSProvider {
const mp3 = await this.openai.audio.speech.create({
model: model,
voice: voice as any, // Type casting to any to avoid type issues
input: text
voice: voice as any,
input: text,
...(options.instructions ? { instructions: options.instructions } : {})
});
// Cost calculation is based on character count

View File

@@ -1,6 +1,8 @@
import { TTSProvider } from '../../interfaces';
import { Config } from '../../config/config';
import { OpenAITTSProvider } from './openAITTSProvider';
import { ElevenLabsTTSProvider } from './elevenLabsTTSProvider';
import { GoogleCloudTTSProvider } from './googleCloudTTSProvider';
/**
* Factory for creating TTS providers
@@ -17,6 +19,10 @@ export class TTSProviderFactory {
switch (providerName) {
case 'openai':
return new OpenAITTSProvider(providerConfig);
case 'elevenlabs':
return new ElevenLabsTTSProvider(providerConfig);
case 'google':
return new GoogleCloudTTSProvider(providerConfig);
// Add other providers here
default:
throw new Error(`TTS provider "${providerName}" not implemented.`);

View File

@@ -1,4 +1,5 @@
export * from './visionProviderFactory';
export * from './openAIVisionProvider';
export * from './geminiVisionProvider';
export * from './ollamaVisionProvider';
export * from './ollamaVisionProvider';
export * from './openRouterVisionProvider';

View File

@@ -0,0 +1,171 @@
import fs from 'fs';
import axios, { AxiosInstance } from 'axios';
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
/**
 * Vision provider that sends images to OpenRouter's `/chat/completions`
 * endpoint as base64 `data:image/jpeg` URLs.
 *
 * Every public method catches its own errors, logs them, and resolves with a
 * placeholder description and zeroed token usage instead of rejecting.
 */
export class OpenRouterVisionProvider implements VisionProvider {
  private config: VisionProviderConfig;
  private axiosInstance: AxiosInstance;

  /**
   * @param config - Provider settings (API key, model, optional base URL and token limit).
   */
  constructor(config: VisionProviderConfig) {
    const headers = {
      'Authorization': `Bearer ${config.apiKey}`,
      'Content-Type': 'application/json',
      'HTTP-Referer': 'https://github.com/anomalyco/aidio-description',
      'X-Title': 'Aidio Description Generator'
    };
    this.config = config;
    this.axiosInstance = axios.create({
      baseURL: config.baseUrl || 'https://openrouter.ai/api/v1',
      headers
    });
  }

  /** Describe a single image using `prompt`. */
  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
    try {
      const picture = OpenRouterVisionProvider.imagePart(imagePath);
      const userMessage = {
        role: 'user',
        content: [OpenRouterVisionProvider.textPart(prompt), picture]
      };
      return await this.requestCompletion([userMessage], 0.1);
    } catch (error: any) {
      console.error('OpenRouter describeImage error:', error.response?.data || error.message);
      return OpenRouterVisionProvider.fallback('Unable to describe this image.');
    }
  }

  /** Describe the differences between two images using `prompt`. */
  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
    try {
      const pictures = [image1Path, image2Path].map(p => OpenRouterVisionProvider.imagePart(p));
      const userMessage = {
        role: 'user',
        content: [OpenRouterVisionProvider.textPart(prompt), ...pictures]
      };
      return await this.requestCompletion([userMessage], 0.1);
    } catch (error: any) {
      console.error('OpenRouter compareImages error:', error.response?.data || error.message);
      return OpenRouterVisionProvider.fallback('Unable to describe the differences between these images.');
    }
  }

  /**
   * Describe a batch of images in one request. When the previous batch left a
   * description, it is prepended as a system message. No `temperature` is sent
   * for batch requests.
   */
  async describeBatch(
    imagePaths: string[],
    lastBatchContext: BatchContext,
    prompt: string
  ): Promise<VisionResult> {
    try {
      const pictures = imagePaths.map(p => OpenRouterVisionProvider.imagePart(p));
      const conversation: Array<Record<string, unknown>> = [];
      if (lastBatchContext?.lastDescription) {
        conversation.push({
          role: 'system',
          content: `Previous batch summary: ${lastBatchContext.lastDescription}`
        });
      }
      conversation.push({
        role: 'user',
        content: [OpenRouterVisionProvider.textPart(prompt), ...pictures]
      });
      return await this.requestCompletion(conversation);
    } catch (error: any) {
      console.error('OpenRouter describeBatch error:', error.response?.data || error.message);
      return OpenRouterVisionProvider.fallback('Unable to describe this batch of images.');
    }
  }

  /**
   * POST the messages to `/chat/completions` and map the response to a
   * `VisionResult`. `temperature` is only included in the payload when given.
   */
  private async requestCompletion(
    messages: Array<Record<string, unknown>>,
    temperature?: number
  ): Promise<VisionResult> {
    const payload = {
      model: this.config.model,
      ...(temperature === undefined ? {} : { temperature }),
      messages,
      max_tokens: this.config.maxTokens || 300
    };
    const { data } = await this.axiosInstance.post('/chat/completions', payload);
    const reported = data.usage;
    return {
      description: data.choices?.[0]?.message?.content?.trim() || 'No description generated.',
      usage: {
        inputTokens: reported?.prompt_tokens || 0,
        outputTokens: reported?.completion_tokens || 0,
        totalTokens: reported?.total_tokens || 0
      }
    };
  }

  /** Build a text content part. */
  private static textPart(text: string): Record<string, unknown> {
    return { type: 'text', text };
  }

  /** Read a file from disk and wrap it as a base64 JPEG `image_url` content part. */
  private static imagePart(filePath: string): Record<string, unknown> {
    const encoded = fs.readFileSync(filePath).toString('base64');
    return {
      type: 'image_url',
      image_url: { url: `data:image/jpeg;base64,${encoded}` }
    };
  }

  /** Placeholder result returned when a request fails. */
  private static fallback(description: string): VisionResult {
    return {
      description,
      usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
    };
  }
}

View File

@@ -3,6 +3,7 @@ import { Config } from '../../config/config';
import { OpenAIVisionProvider } from './openAIVisionProvider';
import { GeminiVisionProvider } from './geminiVisionProvider';
import { OllamaVisionProvider } from './ollamaVisionProvider';
import { OpenRouterVisionProvider } from './openRouterVisionProvider';
/**
* Factory for creating vision AI providers
@@ -23,6 +24,8 @@ export class VisionProviderFactory {
return new GeminiVisionProvider(providerConfig);
case "ollama":
return new OllamaVisionProvider(providerConfig);
case 'openrouter':
return new OpenRouterVisionProvider(providerConfig);
// Add other providers here
default:
throw new Error(`Vision provider "${providerName}" not implemented.`);

View File

@@ -1,150 +1,182 @@
import { Config } from '../config/config';
import { CostBreakdown } from '../interfaces';
import { getVideoDuration } from './mediaUtils';
/**
 * Estimate the cost of generating audio descriptions for a video.
 *
 * The estimate assumes fixed token/character counts per processed unit
 * (frame or batch) and looks prices up in a built-in table; providers or
 * models missing from the table are costed at 0 and a warning is logged.
 *
 * @param videoFilePath - Path to the input video file
 * @param options - Configuration to estimate for. No defaults are merged in,
 *                  so the provider and interval fields read below must be set.
 * @returns Cost estimation breakdown
 */
export async function estimateCost(
  videoFilePath: string,
  options: Partial<Config> = {}
): Promise<CostBreakdown> {
  // Shallow copy only; missing fields stay undefined.
  const settings = { ...options } as Config;

  // Get video duration
  const videoDuration = getVideoDuration(videoFilePath);
  console.log(`Video duration: ${videoDuration} seconds`);

  // Calculate the number of frames or batches to process
  const processingInterval = settings.batchTimeMode
    ? settings.batchWindowDuration
    : settings.captureIntervalSeconds;
  const unitCostMultiplier = settings.batchTimeMode ? settings.framesInBatch : 1;
  const unitType = settings.batchTimeMode ? "batches" : "frames";

  // A missing/zero interval or an unreadable duration would yield NaN or
  // Infinity; treat those as "nothing to process" instead of propagating them.
  const rawUnits = Math.floor(videoDuration / processingInterval);
  const totalUnits = Number.isFinite(rawUnits) && rawUnits > 0 ? rawUnits : 0;
  console.log(`Will process ${totalUnits} ${unitType}`);

  // Pricing constants (as of March 2025, update as needed)
  const pricing: {
    vision: Record<string, Record<string, { input: number; output: number }>>;
    tts: Record<string, Record<string, number>>;
  } = {
    vision: {
      openai: {
        'gpt-4o': {
          input: 0.0025,
          output: 0.01
        }
      },
      gemini: {
        'gemini-pro-vision': {
          input: 0.0025,
          output: 0.0025
        }
      }
    },
    tts: {
      openai: {
        'tts-1': 0.015,
        'tts-1-hd': 0.030
      }
    }
  };

  // Get the pricing for the selected providers
  const visionProvider = settings.visionProvider;
  const visionModel = settings.visionProviders[visionProvider].model;
  const ttsProvider = settings.ttsProvider;
  const ttsModel = settings.ttsProviders[ttsProvider].model;

  // Check if the pricing data exists
  const visionPricing = pricing.vision[visionProvider]?.[visionModel];
  const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel];
  if (!visionPricing) {
    console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`);
  }
  if (!ttsPricing) {
    console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`);
  }

  // Estimated token counts
  const estimatedVisionInputTokens = 1000 * unitCostMultiplier; // Base tokens for the vision input
  const estimatedPromptTokens = 100; // Tokens for the prompt text
  const estimatedOutputTokensPerUnit = 75; // Average tokens for description output
  // Estimated character counts for TTS
  const estimatedCharsPerDescription = 200; // Average characters per description

  // Calculate estimated costs for first unit
  const firstUnitCost = {
    visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
    visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
    tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000
  };

  // For subsequent units, we need context (e.g., previous frames)
  const contextMultiplier = settings.batchTimeMode ? 1.2 : 2; // Less overhead in batch mode
  const subsequentUnitCost = {
    visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
    visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
    tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000
  };

  // Calculate total costs. With zero units nothing is charged; without this
  // guard the "(totalUnits - 1)" term would subtract one subsequent-unit cost
  // from the first-unit cost.
  const firstUnits = totalUnits > 0 ? 1 : 0;
  const subsequentUnits = Math.max(totalUnits - 1, 0);
  const totalVisionInputCost =
    firstUnits * firstUnitCost.visionInput +
    subsequentUnits * subsequentUnitCost.visionInput;
  const totalVisionOutputCost =
    firstUnits * firstUnitCost.visionOutput +
    subsequentUnits * subsequentUnitCost.visionOutput;
  const totalTTSCost =
    firstUnits * firstUnitCost.tts +
    subsequentUnits * subsequentUnitCost.tts;
  const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost;

  // Create cost breakdown
  const costBreakdown: CostBreakdown = {
    videoInfo: {
      duration: videoDuration,
      totalUnits: totalUnits,
      unitType: unitType,
      processingInterval: processingInterval
    },
    providerInfo: {
      visionProvider: visionProvider,
      visionModel: visionModel,
      ttsProvider: ttsProvider,
      ttsModel: ttsModel
    },
    apiCosts: {
      visionInput: totalVisionInputCost.toFixed(4),
      visionOutput: totalVisionOutputCost.toFixed(4),
      tts: totalTTSCost.toFixed(4),
      total: totalCost.toFixed(4)
    },
    estimates: {
      totalAPICallsToProviders: totalUnits * 2, // Vision + TTS for each unit
      estimatedProcessingTimeMinutes: (totalUnits * 3) / 60 // rough estimate, 3 seconds per unit
    }
  };
  return costBreakdown;
}
import { Config } from '../config/config';
import { CostBreakdown } from '../interfaces';
import { getVideoDuration } from './mediaUtils';
type TTSPricingModel = number | { inputTokens: number; outputTokens: number };
/**
 * Estimate the cost of generating audio descriptions for a video.
 *
 * The estimate assumes fixed token/character counts per processed unit
 * (frame or batch) and looks prices up in a built-in table; providers or
 * models missing from the table are costed at 0 and a warning is logged.
 *
 * @param videoFilePath - Path to the input video file
 * @param options - Configuration to estimate for. No defaults are merged in,
 *                  so the provider and interval fields read below must be set.
 * @returns Cost estimation breakdown
 */
export async function estimateCost(
  videoFilePath: string,
  options: Partial<Config> = {}
): Promise<CostBreakdown> {
  // Shallow copy only; missing fields stay undefined.
  const settings = { ...options } as Config;

  // Get video duration
  const videoDuration = getVideoDuration(videoFilePath);
  console.log(`Video duration: ${videoDuration} seconds`);

  // Calculate the number of frames or batches to process
  const processingInterval = settings.batchTimeMode
    ? settings.batchWindowDuration
    : settings.captureIntervalSeconds;
  const unitCostMultiplier = settings.batchTimeMode ? settings.framesInBatch : 1;
  const unitType = settings.batchTimeMode ? "batches" : "frames";

  // A missing/zero interval or an unreadable duration would yield NaN or
  // Infinity; treat those as "nothing to process" instead of propagating them.
  const rawUnits = Math.floor(videoDuration / processingInterval);
  const totalUnits = Number.isFinite(rawUnits) && rawUnits > 0 ? rawUnits : 0;
  console.log(`Will process ${totalUnits} ${unitType}`);

  // Pricing constants (per 1K units unless otherwise noted)
  const pricing: {
    vision: Record<string, Record<string, { input: number; output: number }>>;
    tts: Record<string, Record<string, TTSPricingModel>>;
  } = {
    vision: {
      openai: {
        'gpt-4o': { input: 0.0025, output: 0.01 },
        'gpt-5.4-mini': { input: 0.00015, output: 0.0006 },
        'gpt-4o-mini': { input: 0.00015, output: 0.0006 }
      },
      gemini: {
        'gemini-2.0-flash': { input: 0.0001, output: 0.0004 },
        'gemini-1.5-flash': { input: 0.000075, output: 0.0003 },
        'gemini-1.5-pro': { input: 0.00125, output: 0.005 }
      },
      openrouter: {
        'anthropic/claude-sonnet-4.5': { input: 0.003, output: 0.015 },
        'anthropic/claude-3.5-sonnet': { input: 0.003, output: 0.015 },
        'anthropic/claude-3-haiku': { input: 0.0008, output: 0.004 },
        'google/gemini-2.0-flash-001': { input: 0.0001, output: 0.0004 }
      }
    },
    tts: {
      openai: {
        'tts-1': 0.015,
        'tts-1-hd': 0.030,
        // Token-based pricing (per 1M tokens), unlike the per-1K-character entries.
        'gpt-4o-mini-tts': { inputTokens: 0.60, outputTokens: 12.00 }
      },
      elevenlabs: {
        'eleven_multilingual_v2': 0.30,
        'eleven_turbo_v2.5': 0.015
      },
      google: {
        'chirp-hd': 0.016,
        'wavenet': 0.016,
        'neural2': 0.016,
        'standard': 0.004
      }
    }
  };

  // Get the pricing for the selected providers
  const visionProvider = settings.visionProvider;
  const visionModel = settings.visionProviders[visionProvider].model;
  const ttsProvider = settings.ttsProvider;
  const ttsModel = settings.ttsProviders[ttsProvider].model;

  // Check if the pricing data exists
  const visionPricing = pricing.vision[visionProvider]?.[visionModel];
  const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel];
  if (!visionPricing) {
    console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`);
  }
  if (!ttsPricing) {
    console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`);
  }

  // Estimated token counts
  const estimatedVisionInputTokens = 1000 * unitCostMultiplier;
  const estimatedPromptTokens = 100;
  const estimatedOutputTokensPerUnit = 75;
  // Estimated character counts for TTS
  const estimatedCharsPerDescription = 200;

  // Calculate estimated costs for first unit
  const firstUnitCost = {
    visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
    visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
    tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
  };

  // For subsequent units, we need context (e.g., previous frames)
  const contextMultiplier = settings.batchTimeMode ? 1.2 : 2;
  const subsequentUnitCost = {
    visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
    visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
    tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
  };

  // Calculate total costs. With zero units nothing is charged; without this
  // guard the "(totalUnits - 1)" term would subtract one subsequent-unit cost
  // from the first-unit cost.
  const firstUnits = totalUnits > 0 ? 1 : 0;
  const subsequentUnits = Math.max(totalUnits - 1, 0);
  const totalVisionInputCost =
    firstUnits * firstUnitCost.visionInput +
    subsequentUnits * subsequentUnitCost.visionInput;
  const totalVisionOutputCost =
    firstUnits * firstUnitCost.visionOutput +
    subsequentUnits * subsequentUnitCost.visionOutput;
  const totalTTSCost =
    firstUnits * firstUnitCost.tts +
    subsequentUnits * subsequentUnitCost.tts;
  const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost;

  // Create cost breakdown
  const costBreakdown: CostBreakdown = {
    videoInfo: {
      duration: videoDuration,
      totalUnits: totalUnits,
      unitType: unitType,
      processingInterval: processingInterval
    },
    providerInfo: {
      visionProvider: visionProvider,
      visionModel: visionModel,
      ttsProvider: ttsProvider,
      ttsModel: ttsModel
    },
    apiCosts: {
      visionInput: totalVisionInputCost.toFixed(4),
      visionOutput: totalVisionOutputCost.toFixed(4),
      tts: totalTTSCost.toFixed(4),
      total: totalCost.toFixed(4)
    },
    estimates: {
      // One vision call plus one TTS call per unit.
      totalAPICallsToProviders: totalUnits * 2,
      // Rough estimate of 3 seconds per unit.
      estimatedProcessingTimeMinutes: (totalUnits * 3) / 60
    }
  };
  return costBreakdown;
}
/**
 * Estimate the TTS cost of speaking `charCount` characters.
 *
 * @param charCount - Number of characters to be synthesized.
 * @param pricing - Either a price per 1000 characters (number) or per-token
 *                  prices per 1M tokens (object); missing pricing costs 0.
 * @returns The estimated cost in the same currency unit as `pricing`.
 */
function calculateTTSCost(charCount: number, pricing: TTSPricingModel | undefined): number {
  if (typeof pricing === 'number') {
    // Character-based pricing, quoted per 1000 characters. A falsy price
    // (0 or NaN) is treated as "no pricing" and costs nothing.
    return pricing ? (charCount * pricing) / 1000 : 0;
  }
  if (!pricing) {
    return 0;
  }

  // Token-based pricing (e.g., gpt-4o-mini-tts), quoted per 1M tokens.
  // Rough estimate: 1 char ≈ 0.25 input tokens for English text, while the
  // audio output is token-heavy at about 3 tokens per character.
  const inputTokenCount = charCount * 0.25;
  const outputTokenCount = charCount * 3;
  const inputCost = inputTokenCount * pricing.inputTokens;
  const outputCost = outputTokenCount * pricing.outputTokens;
  return (inputCost + outputCost) / 1000000;
}

View File

@@ -173,7 +173,8 @@ export async function generateAudioDescription(
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
voice: settings.ttsVoice,
model: settings.ttsProviders[settings.ttsProvider].model,
speedFactor: settings.ttsSpeedFactor
speedFactor: settings.ttsSpeedFactor,
instructions: settings.ttsInstructions
});
const audioDuration = ttsResult.duration;
@@ -304,7 +305,8 @@ async function generateAudioDescriptionBatch(
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
voice: settings.ttsVoice,
model: settings.ttsProviders[settings.ttsProvider].model,
speedFactor: settings.ttsSpeedFactor
speedFactor: settings.ttsSpeedFactor,
instructions: settings.ttsInstructions
});
const audioDuration = ttsResult.duration;