diff --git a/src/utils/costEstimator.ts b/src/utils/costEstimator.ts
index 8e9d3fb..50ed3c8 100644
--- a/src/utils/costEstimator.ts
+++ b/src/utils/costEstimator.ts
@@ -1,150 +1,214 @@
-import { Config } from '../config/config';
-import { CostBreakdown } from '../interfaces';
-import { getVideoDuration } from './mediaUtils';
-
-/**
- * Estimate the cost of generating audio descriptions for a video
- * @param videoFilePath - Path to the input video file
- * @param options - Optional configuration overrides
- * @returns Cost estimation breakdown
- */
-export async function estimateCost(
-  videoFilePath: string,
-  options: Partial<Config> = {}
-): Promise<CostBreakdown> {
-  // Merge provided options with defaults
-  const settings = { ...options } as Config;
-
-  // Get video duration
-  const videoDuration = getVideoDuration(videoFilePath);
-  console.log(`Video duration: ${videoDuration} seconds`);
-
-  // Calculate the number of frames or batches to process
-  let totalUnits: number;
-  let unitCostMultiplier: number;
-  let unitType: string;
-
-  if (settings.batchTimeMode) {
-    totalUnits = Math.floor(videoDuration / settings.batchWindowDuration);
-    unitCostMultiplier = settings.framesInBatch; // Cost multiplier for batch mode
-    unitType = "batches";
-  } else {
-    totalUnits = Math.floor(videoDuration / settings.captureIntervalSeconds);
-    unitCostMultiplier = 1; // No multiplier for normal mode
-    unitType = "frames";
-  }
-
-  console.log(`Will process ${totalUnits} ${unitType}`);
-
-  // Pricing constants (as of March 2025, update as needed)
-  const pricing: {
-    vision: Record<string, Record<string, { input: number; output: number }>>;
-    tts: Record<string, Record<string, number>>;
-  } = {
-    vision: {
-      openai: {
-        'gpt-4o': {
-          input: 0.0025,
-          output: 0.01
-        }
-      },
-      gemini: {
-        'gemini-pro-vision': {
-          input: 0.0025,
-          output: 0.0025
-        }
-      }
-    },
-    tts: {
-      openai: {
-        'tts-1': 0.015,
-        'tts-1-hd': 0.030
-      }
-    }
-  };
-
-  // Get the pricing for the selected providers
-  const visionProvider = settings.visionProvider;
-  const visionModel = settings.visionProviders[visionProvider].model;
-  const ttsProvider = settings.ttsProvider;
-  const ttsModel = settings.ttsProviders[ttsProvider].model;
-
-  // Check if the pricing data exists
-  const visionPricing = pricing.vision[visionProvider]?.[visionModel];
-  const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel];
-
-  if (!visionPricing) {
-    console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`);
-  }
-
-  if (!ttsPricing) {
-    console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`);
-  }
-
-  // Estimated token counts
-  const estimatedVisionInputTokens = 1000 * unitCostMultiplier; // Base tokens for the vision input
-  const estimatedPromptTokens = 100; // Tokens for the prompt text
-  const estimatedOutputTokensPerUnit = 75; // Average tokens for description output
-
-  // Estimated character counts for TTS
-  const estimatedCharsPerDescription = 200; // Average characters per description
-
-  // Calculate estimated costs for first unit
-  const firstUnitCost = {
-    visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
-    visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
-    tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000
-  };
-
-  // For subsequent units, we need context (e.g., previous frames)
-  const contextMultiplier = settings.batchTimeMode ? 1.2 : 2; // Less overhead in batch mode
-
-  const subsequentUnitCost = {
-    visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
-    visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
-    tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000
-  };
-
-  // Calculate total costs
-  const totalVisionInputCost =
-    firstUnitCost.visionInput +
-    (totalUnits - 1) * subsequentUnitCost.visionInput;
-
-  const totalVisionOutputCost =
-    firstUnitCost.visionOutput +
-    (totalUnits - 1) * subsequentUnitCost.visionOutput;
-
-  const totalTTSCost =
-    firstUnitCost.tts +
-    (totalUnits - 1) * subsequentUnitCost.tts;
-
-  const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost;
-
-  // Create cost breakdown
-  const costBreakdown: CostBreakdown = {
-    videoInfo: {
-      duration: videoDuration,
-      totalUnits: totalUnits,
-      unitType: unitType,
-      processingInterval: settings.batchTimeMode ? settings.batchWindowDuration : settings.captureIntervalSeconds
-    },
-    providerInfo: {
-      visionProvider: visionProvider,
-      visionModel: visionModel,
-      ttsProvider: ttsProvider,
-      ttsModel: ttsModel
-    },
-    apiCosts: {
-      visionInput: totalVisionInputCost.toFixed(4),
-      visionOutput: totalVisionOutputCost.toFixed(4),
-      tts: totalTTSCost.toFixed(4),
-      total: totalCost.toFixed(4)
-    },
-    estimates: {
-      totalAPICallsToProviders: totalUnits * 2, // Vision + TTS for each unit
-      estimatedProcessingTimeMinutes: (totalUnits * 3) / 60 // rough estimate, 3 seconds per unit
-    }
-  };
-
-  return costBreakdown;
-}
\ No newline at end of file
+import { Config } from '../config/config';
+import { CostBreakdown } from '../interfaces';
+import { getVideoDuration } from './mediaUtils';
+
+/** Vision model prices per 1K input tokens and per 1K output tokens. */
+interface VisionPricingModel {
+  input: number;
+  output: number;
+}
+
+/**
+ * TTS price entry: a plain number is a price per 1K characters, an object is
+ * a token-based price per 1M input/output tokens (see calculateTTSCost).
+ */
+type TTSPricingModel = number | { inputTokens: number; outputTokens: number };
+
+// Pricing constants (per 1K units unless otherwise noted)
+const PRICING: {
+  vision: Record<string, Record<string, VisionPricingModel>>;
+  tts: Record<string, Record<string, TTSPricingModel>>;
+} = {
+  vision: {
+    openai: {
+      'gpt-4o': { input: 0.0025, output: 0.01 },
+      'gpt-5.4-mini': { input: 0.00015, output: 0.0006 },
+      'gpt-4o-mini': { input: 0.00015, output: 0.0006 }
+    },
+    gemini: {
+      'gemini-2.0-flash': { input: 0.0001, output: 0.0004 },
+      'gemini-1.5-flash': { input: 0.000075, output: 0.0003 },
+      'gemini-1.5-pro': { input: 0.00125, output: 0.005 }
+    },
+    openrouter: {
+      'anthropic/claude-sonnet-4.5': { input: 0.003, output: 0.015 },
+      'anthropic/claude-3.5-sonnet': { input: 0.003, output: 0.015 },
+      'anthropic/claude-3-haiku': { input: 0.0008, output: 0.004 },
+      'google/gemini-2.0-flash-001': { input: 0.0001, output: 0.0004 }
+    }
+  },
+  tts: {
+    openai: {
+      'tts-1': 0.015,
+      'tts-1-hd': 0.030,
+      // Token-based model: prices are per 1M tokens, not per 1K characters.
+      'gpt-4o-mini-tts': { inputTokens: 0.60, outputTokens: 12.00 }
+    },
+    elevenlabs: {
+      'eleven_multilingual_v2': 0.30,
+      'eleven_turbo_v2.5': 0.015
+    },
+    google: {
+      'chirp-hd': 0.016,
+      'wavenet': 0.016,
+      'neural2': 0.016,
+      'standard': 0.004
+    }
+  }
+};
+
+// Heuristic sizes of one processing unit
+const VISION_TOKENS_PER_FRAME = 1000; // Base tokens for the vision input
+const PROMPT_TOKENS = 100; // Tokens for the prompt text
+const OUTPUT_TOKENS_PER_UNIT = 75; // Average tokens for description output
+const CHARS_PER_DESCRIPTION = 200; // Average characters per description
+
+// Heuristics for token-priced TTS models
+const TTS_INPUT_TOKENS_PER_CHAR = 0.25; // Rough estimate for English text
+const TTS_OUTPUT_TOKENS_PER_CHAR = 3; // Audio output is token-heavy
+
+/**
+ * Estimate the cost of generating audio descriptions for a video.
+ *
+ * The first unit is priced without context; every later unit has its vision
+ * input tokens scaled by a context multiplier. A video shorter than one
+ * processing interval yields zero units and therefore a zero-cost estimate.
+ *
+ * @param videoFilePath - Path to the input video file
+ * @param options - Optional configuration overrides
+ * @returns Cost estimation breakdown
+ */
+export async function estimateCost(
+  videoFilePath: string,
+  options: Partial<Config> = {}
+): Promise<CostBreakdown> {
+  // NOTE: no defaults are merged here; the options are used as-is, so the
+  // caller has to supply every setting that is read below.
+  const settings = { ...options } as Config;
+
+  // Get video duration
+  const videoDuration = getVideoDuration(videoFilePath);
+  console.log(`Video duration: ${videoDuration} seconds`);
+
+  // Calculate the number of frames or batches to process
+  const batchMode = Boolean(settings.batchTimeMode);
+  const processingInterval = batchMode ? settings.batchWindowDuration : settings.captureIntervalSeconds;
+  const unitCostMultiplier = batchMode ? settings.framesInBatch : 1;
+  const unitType = batchMode ? 'batches' : 'frames';
+  const totalUnits = countUnits(videoDuration, processingInterval);
+
+  console.log(`Will process ${totalUnits} ${unitType}`);
+
+  // Get the pricing for the selected providers
+  const visionProvider = settings.visionProvider;
+  const visionModel = settings.visionProviders[visionProvider].model;
+  const ttsProvider = settings.ttsProvider;
+  const ttsModel = settings.ttsProviders[ttsProvider].model;
+
+  // Missing pricing data is reported and then treated as a zero price
+  const visionPricing = PRICING.vision[visionProvider]?.[visionModel];
+  const ttsPricing = PRICING.tts[ttsProvider]?.[ttsModel];
+
+  if (!visionPricing) {
+    console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`);
+  }
+
+  if (!ttsPricing) {
+    console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`);
+  }
+
+  const visionInputPrice = visionPricing?.input ?? 0;
+  const visionOutputPrice = visionPricing?.output ?? 0;
+  const visionTokensPerUnit = VISION_TOKENS_PER_FRAME * unitCostMultiplier;
+
+  // For subsequent units, we need context (e.g., previous frames);
+  // there is less overhead in batch mode
+  const contextMultiplier = batchMode ? 1.2 : 2;
+
+  // Per-unit costs (vision prices are per 1K tokens)
+  const firstUnitVisionInput = (visionTokensPerUnit + PROMPT_TOKENS) * visionInputPrice / 1000;
+  const subsequentUnitVisionInput = (visionTokensPerUnit * contextMultiplier + PROMPT_TOKENS) * visionInputPrice / 1000;
+  const visionOutputPerUnit = OUTPUT_TOKENS_PER_UNIT * visionOutputPrice / 1000;
+  const ttsPerUnit = calculateTTSCost(CHARS_PER_DESCRIPTION, ttsPricing);
+
+  // Split the units into the context-free first unit and the rest. Deriving
+  // the split from the clamped count keeps the totals at zero (instead of
+  // going negative) when there are no units to process.
+  const firstUnits = Math.min(totalUnits, 1);
+  const subsequentUnits = totalUnits - firstUnits;
+
+  // Calculate total costs
+  const totalVisionInputCost =
+    firstUnits * firstUnitVisionInput +
+    subsequentUnits * subsequentUnitVisionInput;
+  const totalVisionOutputCost = totalUnits * visionOutputPerUnit;
+  const totalTTSCost = totalUnits * ttsPerUnit;
+  const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost;
+
+  // Create cost breakdown
+  const costBreakdown: CostBreakdown = {
+    videoInfo: {
+      duration: videoDuration,
+      totalUnits,
+      unitType,
+      processingInterval
+    },
+    providerInfo: {
+      visionProvider,
+      visionModel,
+      ttsProvider,
+      ttsModel
+    },
+    apiCosts: {
+      visionInput: totalVisionInputCost.toFixed(4),
+      visionOutput: totalVisionOutputCost.toFixed(4),
+      tts: totalTTSCost.toFixed(4),
+      total: totalCost.toFixed(4)
+    },
+    estimates: {
+      totalAPICallsToProviders: totalUnits * 2, // Vision + TTS for each unit
+      estimatedProcessingTimeMinutes: (totalUnits * 3) / 60 // rough estimate, 3 seconds per unit
+    }
+  };
+
+  return costBreakdown;
+}
+
+/**
+ * Count the whole processing units that fit into a video.
+ *
+ * @param duration - Video duration in seconds
+ * @param interval - Length of one unit, in the same time unit as `duration`
+ * @returns Number of complete units; 0 (with a warning) when the quotient is
+ *   negative or not finite, e.g. because the interval is zero or missing
+ */
+function countUnits(duration: number, interval: number): number {
+  const units = Math.floor(duration / interval);
+  if (!Number.isFinite(units) || units < 0) {
+    console.warn(`Warning: Cannot derive a unit count from duration "${duration}" and interval "${interval}"; assuming 0.`);
+    return 0;
+  }
+  return units;
+}
+
+/**
+ * Estimate the TTS cost of synthesizing one description.
+ *
+ * @param charCount - Number of characters to synthesize
+ * @param pricing - Price entry for the selected model, or `undefined` when
+ *   no pricing data is available
+ * @returns Estimated cost, or 0 when there is no pricing data
+ */
+function calculateTTSCost(charCount: number, pricing: TTSPricingModel | undefined): number {
+  if (pricing == null) return 0;
+
+  if (typeof pricing === 'number') {
+    // Per-character pricing: cost per 1000 characters
+    return charCount * pricing / 1000;
+  }
+
+  // Per-token pricing (e.g., gpt-4o-mini-tts): cost per 1M tokens
+  const estimatedInputTokens = charCount * TTS_INPUT_TOKENS_PER_CHAR;
+  const estimatedOutputTokens = charCount * TTS_OUTPUT_TOKENS_PER_CHAR;
+  return (estimatedInputTokens * pricing.inputTokens + estimatedOutputTokens * pricing.outputTokens) / 1000000;
+}