Update cost estimator with all new models and per-token TTS pricing support

This commit is contained in:
2026-05-13 02:44:30 +02:00
parent f05e57493c
commit 3a198d7d50

View File

@@ -1,150 +1,182 @@
import { Config } from '../config/config'; import { Config } from '../config/config';
import { CostBreakdown } from '../interfaces'; import { CostBreakdown } from '../interfaces';
import { getVideoDuration } from './mediaUtils'; import { getVideoDuration } from './mediaUtils';
/**
 * Price entry for a TTS model: either a plain number (cost per 1,000
 * characters) or an object holding per-1M-token input and output prices.
 */
type TTSPricingModel = number | { inputTokens: number; outputTokens: number };

/**
 * Estimate the cost of generating audio descriptions for a video.
 *
 * The estimate is built from the video duration, the configured capture
 * interval (or batch window in batch mode), a built-in pricing table, and
 * fixed per-unit token/character assumptions. Unknown provider/model pairs
 * produce a console warning and contribute 0 to the estimate.
 *
 * @param videoFilePath - Path to the input video file
 * @param options - Optional configuration overrides
 * @returns Cost estimation breakdown; monetary values are strings with 4 decimals
 */
export async function estimateCost(
  videoFilePath: string,
  options: Partial<Config> = {}
): Promise<CostBreakdown> {
  // NOTE(review): no defaults are merged here; the caller's options are copied
  // and treated as a complete Config. Callers presumably pass a full
  // configuration - confirm, otherwise the provider lookups below will throw.
  const settings = { ...options } as Config;

  // Get video duration
  const videoDuration = getVideoDuration(videoFilePath);
  console.log(`Video duration: ${videoDuration} seconds`);

  // Calculate the number of frames or batches to process
  let totalUnits: number;
  let unitCostMultiplier: number;
  let unitType: string;

  if (settings.batchTimeMode) {
    totalUnits = Math.floor(videoDuration / settings.batchWindowDuration);
    unitCostMultiplier = settings.framesInBatch; // each batch carries several frames
    unitType = "batches";
  } else {
    totalUnits = Math.floor(videoDuration / settings.captureIntervalSeconds);
    unitCostMultiplier = 1; // one frame per unit in normal mode
    unitType = "frames";
  }

  console.log(`Will process ${totalUnits} ${unitType}`);

  // Pricing constants (per 1K units unless otherwise noted)
  const pricing: {
    vision: Record<string, Record<string, { input: number; output: number }>>;
    tts: Record<string, Record<string, TTSPricingModel>>;
  } = {
    vision: {
      openai: {
        'gpt-4o': { input: 0.0025, output: 0.01 },
        'gpt-5.4-mini': { input: 0.00015, output: 0.0006 },
        'gpt-4o-mini': { input: 0.00015, output: 0.0006 }
      },
      gemini: {
        'gemini-2.0-flash': { input: 0.0001, output: 0.0004 },
        'gemini-1.5-flash': { input: 0.000075, output: 0.0003 },
        'gemini-1.5-pro': { input: 0.00125, output: 0.005 }
      },
      openrouter: {
        'anthropic/claude-sonnet-4.5': { input: 0.003, output: 0.015 },
        'anthropic/claude-3.5-sonnet': { input: 0.003, output: 0.015 },
        'anthropic/claude-3-haiku': { input: 0.0008, output: 0.004 },
        'google/gemini-2.0-flash-001': { input: 0.0001, output: 0.0004 }
      }
    },
    tts: {
      openai: {
        'tts-1': 0.015,
        'tts-1-hd': 0.030,
        // Token-priced model: values are per 1M tokens, not per 1K characters.
        'gpt-4o-mini-tts': { inputTokens: 0.60, outputTokens: 12.00 }
      },
      elevenlabs: {
        'eleven_multilingual_v2': 0.30,
        'eleven_turbo_v2.5': 0.015
      },
      google: {
        'chirp-hd': 0.016,
        'wavenet': 0.016,
        'neural2': 0.016,
        'standard': 0.004
      }
    }
  };

  // Get the pricing for the selected providers
  const visionProvider = settings.visionProvider;
  const visionModel = settings.visionProviders[visionProvider].model;
  const ttsProvider = settings.ttsProvider;
  const ttsModel = settings.ttsProviders[ttsProvider].model;

  // Check if the pricing data exists
  const visionPricing = pricing.vision[visionProvider]?.[visionModel];
  const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel];

  if (!visionPricing) {
    console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`);
  }

  if (!ttsPricing) {
    console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`);
  }

  // Estimated token counts
  const estimatedVisionInputTokens = 1000 * unitCostMultiplier; // base tokens for the vision input
  const estimatedPromptTokens = 100; // tokens for the prompt text
  const estimatedOutputTokensPerUnit = 75; // average tokens for one description

  // Estimated character counts for TTS
  const estimatedCharsPerDescription = 200;

  // Missing pricing contributes nothing to the estimate.
  const visionInputRate = visionPricing?.input ?? 0;
  const visionOutputRate = visionPricing?.output ?? 0;

  // Calculate estimated costs for first unit
  const firstUnitCost = {
    visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * visionInputRate / 1000,
    visionOutput: estimatedOutputTokensPerUnit * visionOutputRate / 1000,
    tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
  };

  // For subsequent units, we need context (e.g., previous frames);
  // the overhead factor is smaller in batch mode.
  const contextMultiplier = settings.batchTimeMode ? 1.2 : 2;

  const subsequentUnitCost = {
    visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * visionInputRate / 1000,
    visionOutput: estimatedOutputTokensPerUnit * visionOutputRate / 1000,
    tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
  };

  // Split the units into "first" and "subsequent". A video shorter than one
  // interval yields zero units; clamping keeps the totals at 0 instead of
  // going negative through (totalUnits - 1).
  const firstUnitCount = totalUnits > 0 ? 1 : 0;
  const subsequentUnitCount = Math.max(totalUnits - 1, 0);

  // Calculate total costs
  const totalVisionInputCost =
    firstUnitCount * firstUnitCost.visionInput +
    subsequentUnitCount * subsequentUnitCost.visionInput;

  const totalVisionOutputCost =
    firstUnitCount * firstUnitCost.visionOutput +
    subsequentUnitCount * subsequentUnitCost.visionOutput;

  const totalTTSCost =
    firstUnitCount * firstUnitCost.tts +
    subsequentUnitCount * subsequentUnitCost.tts;

  const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost;

  // Create cost breakdown
  const costBreakdown: CostBreakdown = {
    videoInfo: {
      duration: videoDuration,
      totalUnits: totalUnits,
      unitType: unitType,
      processingInterval: settings.batchTimeMode ? settings.batchWindowDuration : settings.captureIntervalSeconds
    },
    providerInfo: {
      visionProvider: visionProvider,
      visionModel: visionModel,
      ttsProvider: ttsProvider,
      ttsModel: ttsModel
    },
    apiCosts: {
      visionInput: totalVisionInputCost.toFixed(4),
      visionOutput: totalVisionOutputCost.toFixed(4),
      tts: totalTTSCost.toFixed(4),
      total: totalCost.toFixed(4)
    },
    estimates: {
      totalAPICallsToProviders: totalUnits * 2, // one vision call + one TTS call per unit
      estimatedProcessingTimeMinutes: (totalUnits * 3) / 60 // rough estimate: 3 seconds per unit
    }
  };

  return costBreakdown;
}
/**
 * Estimate the TTS cost of synthesizing `charCount` characters.
 *
 * Two pricing shapes are supported:
 * - a plain number, treated as the cost per 1,000 characters;
 * - an object with `inputTokens` / `outputTokens`, treated as the cost per
 *   1,000,000 tokens (e.g. gpt-4o-mini-tts).
 *
 * @param charCount - Number of characters to be spoken
 * @param pricing - Pricing entry for the selected model, if one is known
 * @returns Estimated cost; 0 when no (or zero) pricing is supplied
 */
function calculateTTSCost(charCount: number, pricing: TTSPricingModel | undefined): number {
  // Unknown or zero pricing contributes nothing to the estimate.
  if (!pricing) {
    return 0;
  }

  if (typeof pricing !== 'number') {
    // Token-based pricing. Rough heuristics: about 0.25 text tokens per
    // character on the input side, and 3 audio tokens per character on the
    // output side (audio output is token-heavy).
    const textTokens = charCount * 0.25;
    const audioTokens = charCount * 3;
    const costPerMillionScale = textTokens * pricing.inputTokens + audioTokens * pricing.outputTokens;
    return costPerMillionScale / 1000000;
  }

  // Character-based pricing: the rate is quoted per 1,000 characters.
  return charCount * pricing / 1000;
}