Update cost estimator with all new models and per-token TTS pricing support

This commit is contained in:
2026-05-13 02:44:30 +02:00
parent f05e57493c
commit 3a198d7d50

View File

@@ -2,6 +2,8 @@ import { Config } from '../config/config';
import { CostBreakdown } from '../interfaces'; import { CostBreakdown } from '../interfaces';
import { getVideoDuration } from './mediaUtils'; import { getVideoDuration } from './mediaUtils';
/**
 * Price entry for a single text-to-speech model.
 *
 * Two shapes are supported:
 * - `number`: a flat rate, applied per 1,000 characters of input text.
 * - `{ inputTokens, outputTokens }`: token-based rates, applied per
 *   1,000,000 tokens (used for token-billed models such as gpt-4o-mini-tts).
 */
type TTSPricingModel =
  | number
  | { inputTokens: number; outputTokens: number };
/** /**
* Estimate the cost of generating audio descriptions for a video * Estimate the cost of generating audio descriptions for a video
* @param videoFilePath - Path to the input video file * @param videoFilePath - Path to the input video file
@@ -26,39 +28,54 @@ export async function estimateCost(
if (settings.batchTimeMode) { if (settings.batchTimeMode) {
totalUnits = Math.floor(videoDuration / settings.batchWindowDuration); totalUnits = Math.floor(videoDuration / settings.batchWindowDuration);
unitCostMultiplier = settings.framesInBatch; // Cost multiplier for batch mode unitCostMultiplier = settings.framesInBatch;
unitType = "batches"; unitType = "batches";
} else { } else {
totalUnits = Math.floor(videoDuration / settings.captureIntervalSeconds); totalUnits = Math.floor(videoDuration / settings.captureIntervalSeconds);
unitCostMultiplier = 1; // No multiplier for normal mode unitCostMultiplier = 1;
unitType = "frames"; unitType = "frames";
} }
console.log(`Will process ${totalUnits} ${unitType}`); console.log(`Will process ${totalUnits} ${unitType}`);
// Pricing constants (as of March 2025, update as needed) // Pricing constants (per 1K units unless otherwise noted)
const pricing: { const pricing: {
vision: Record<string, Record<string, { input: number; output: number }>>; vision: Record<string, Record<string, { input: number; output: number }>>;
tts: Record<string, Record<string, number>>; tts: Record<string, Record<string, TTSPricingModel>>;
} = { } = {
vision: { vision: {
openai: { openai: {
'gpt-4o': { 'gpt-4o': { input: 0.0025, output: 0.01 },
input: 0.0025, 'gpt-5.4-mini': { input: 0.00015, output: 0.0006 },
output: 0.01 'gpt-4o-mini': { input: 0.00015, output: 0.0006 }
}
}, },
gemini: { gemini: {
'gemini-pro-vision': { 'gemini-2.0-flash': { input: 0.0001, output: 0.0004 },
input: 0.0025, 'gemini-1.5-flash': { input: 0.000075, output: 0.0003 },
output: 0.0025 'gemini-1.5-pro': { input: 0.00125, output: 0.005 }
} },
openrouter: {
'anthropic/claude-sonnet-4.5': { input: 0.003, output: 0.015 },
'anthropic/claude-3.5-sonnet': { input: 0.003, output: 0.015 },
'anthropic/claude-3-haiku': { input: 0.0008, output: 0.004 },
'google/gemini-2.0-flash-001': { input: 0.0001, output: 0.0004 }
} }
}, },
tts: { tts: {
openai: { openai: {
'tts-1': 0.015, 'tts-1': 0.015,
'tts-1-hd': 0.030 'tts-1-hd': 0.030,
'gpt-4o-mini-tts': { inputTokens: 0.60, outputTokens: 12.00 }
},
elevenlabs: {
'eleven_multilingual_v2': 0.30,
'eleven_turbo_v2.5': 0.015
},
google: {
'chirp-hd': 0.016,
'wavenet': 0.016,
'neural2': 0.016,
'standard': 0.004
} }
} }
}; };
@@ -82,27 +99,27 @@ export async function estimateCost(
} }
// Estimated token counts // Estimated token counts
const estimatedVisionInputTokens = 1000 * unitCostMultiplier; // Base tokens for the vision input const estimatedVisionInputTokens = 1000 * unitCostMultiplier;
const estimatedPromptTokens = 100; // Tokens for the prompt text const estimatedPromptTokens = 100;
const estimatedOutputTokensPerUnit = 75; // Average tokens for description output const estimatedOutputTokensPerUnit = 75;
// Estimated character counts for TTS // Estimated character counts for TTS
const estimatedCharsPerDescription = 200; // Average characters per description const estimatedCharsPerDescription = 200;
// Calculate estimated costs for first unit // Calculate estimated costs for first unit
const firstUnitCost = { const firstUnitCost = {
visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000, visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000, visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000 tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
}; };
// For subsequent units, we need context (e.g., previous frames) // For subsequent units, we need context (e.g., previous frames)
const contextMultiplier = settings.batchTimeMode ? 1.2 : 2; // Less overhead in batch mode const contextMultiplier = settings.batchTimeMode ? 1.2 : 2;
const subsequentUnitCost = { const subsequentUnitCost = {
visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000, visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000, visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000 tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
}; };
// Calculate total costs // Calculate total costs
@@ -141,10 +158,25 @@ export async function estimateCost(
total: totalCost.toFixed(4) total: totalCost.toFixed(4)
}, },
estimates: { estimates: {
totalAPICallsToProviders: totalUnits * 2, // Vision + TTS for each unit totalAPICallsToProviders: totalUnits * 2,
estimatedProcessingTimeMinutes: (totalUnits * 3) / 60 // rough estimate, 3 seconds per unit estimatedProcessingTimeMinutes: (totalUnits * 3) / 60
} }
}; };
return costBreakdown; return costBreakdown;
} }
/**
 * Estimate the text-to-speech cost for a piece of text.
 *
 * The pricing entry decides how the cost is derived:
 * - a plain number is a price per 1,000 characters;
 * - an object with `inputTokens` / `outputTokens` holds prices per 1,000,000
 *   tokens, and token counts are approximated from the character count.
 *
 * @param charCount - Number of characters that will be synthesized
 * @param pricing - Pricing entry for the selected TTS model, if one is known
 * @returns The estimated cost, or 0 when the pricing entry is missing or falsy
 */
function calculateTTSCost(charCount: number, pricing: TTSPricingModel | undefined): number {
  const CHARS_PER_PRICING_UNIT = 1000;
  const TOKENS_PER_PRICING_UNIT = 1000000;
  // Rough heuristics: ~0.25 input tokens per character of English text, and
  // ~3 output tokens per character because audio output is token-heavy.
  const INPUT_TOKENS_PER_CHAR = 0.25;
  const OUTPUT_TOKENS_PER_CHAR = 3;

  if (typeof pricing === 'number') {
    // Character-based billing; a falsy rate (0 or NaN) means nothing to charge.
    return pricing ? (charCount * pricing) / CHARS_PER_PRICING_UNIT : 0;
  }

  if (!pricing) {
    // No pricing entry known for this model.
    return 0;
  }

  // Token-based billing (e.g., gpt-4o-mini-tts).
  const { inputTokens: inputRate, outputTokens: outputRate } = pricing;
  const inputTokenCount = charCount * INPUT_TOKENS_PER_CHAR;
  const outputTokenCount = charCount * OUTPUT_TOKENS_PER_CHAR;
  const tokenCost = inputTokenCount * inputRate + outputTokenCount * outputRate;
  return tokenCost / TOKENS_PER_PRICING_UNIT;
}