Update cost estimator with all new models and per-token TTS pricing support
This commit is contained in:
@@ -1,150 +1,182 @@
|
||||
import { Config } from '../config/config';
|
||||
import { CostBreakdown } from '../interfaces';
|
||||
import { getVideoDuration } from './mediaUtils';
|
||||
|
||||
/**
 * Estimate the cost of generating audio descriptions for a video.
 *
 * The video is divided into units: one unit per `captureIntervalSeconds`
 * in normal mode, or one unit per `batchWindowDuration` when
 * `batchTimeMode` is set. Each unit is priced as one vision request plus
 * one TTS request, using the hard-coded pricing table below.
 *
 * @param videoFilePath - Path to the input video file
 * @param options - Configuration used for the estimate. The object is
 *   shallow-copied as-is; no defaults are filled in here, so the caller
 *   must supply every setting this function reads.
 * @returns Cost estimation breakdown. If the unit count cannot be derived
 *   (missing or non-positive interval, or a video shorter than one
 *   interval) the breakdown reports 0 units and zero costs.
 */
export async function estimateCost(
  videoFilePath: string,
  options: Partial<Config> = {}
): Promise<CostBreakdown> {
  // Shallow copy of the provided options (nothing is merged in here).
  const settings = { ...options } as Config;

  // Get video duration
  const videoDuration = getVideoDuration(videoFilePath);
  console.log(`Video duration: ${videoDuration} seconds`);

  // Work out what a "unit" is for the selected mode.
  const processingInterval = settings.batchTimeMode
    ? settings.batchWindowDuration
    : settings.captureIntervalSeconds;
  // In batch mode every unit carries several frames, so its image-token
  // estimate is scaled by the batch size; normal mode has no multiplier.
  const unitCostMultiplier: number = settings.batchTimeMode ? settings.framesInBatch : 1;
  const unitType: string = settings.batchTimeMode ? 'batches' : 'frames';

  // Calculate the number of frames or batches to process. A missing or zero
  // interval would yield NaN/Infinity, which would leak into every cost
  // figure, so fall back to 0 units instead.
  let totalUnits = Math.floor(videoDuration / processingInterval);
  if (!Number.isFinite(totalUnits) || totalUnits < 0) {
    console.warn(
      `Warning: Cannot derive a unit count from duration "${videoDuration}" and interval "${processingInterval}"; assuming 0 ${unitType}.`
    );
    totalUnits = 0;
  }

  console.log(`Will process ${totalUnits} ${unitType}`);

  // Pricing constants (as of March 2025, update as needed)
  const pricing: {
    vision: Record<string, Record<string, { input: number; output: number }>>;
    tts: Record<string, Record<string, number>>;
  } = {
    vision: {
      openai: {
        'gpt-4o': {
          input: 0.0025,
          output: 0.01
        }
      },
      gemini: {
        'gemini-pro-vision': {
          input: 0.0025,
          output: 0.0025
        }
      }
    },
    tts: {
      openai: {
        'tts-1': 0.015,
        'tts-1-hd': 0.030
      }
    }
  };

  // Get the pricing for the selected providers
  const visionProvider = settings.visionProvider;
  const visionModel = settings.visionProviders[visionProvider].model;
  const ttsProvider = settings.ttsProvider;
  const ttsModel = settings.ttsProviders[ttsProvider].model;

  // Check if the pricing data exists; unknown models are priced at 0.
  const visionPricing = pricing.vision[visionProvider]?.[visionModel];
  const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel];

  if (!visionPricing) {
    console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`);
  }

  if (!ttsPricing) {
    console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`);
  }

  const visionInputRate = visionPricing?.input ?? 0;
  const visionOutputRate = visionPricing?.output ?? 0;
  const ttsRate = ttsPricing ?? 0;

  // Estimated token counts
  const estimatedVisionInputTokens = 1000 * unitCostMultiplier; // Base tokens for the vision input
  const estimatedPromptTokens = 100; // Tokens for the prompt text
  const estimatedOutputTokensPerUnit = 75; // Average tokens for description output

  // Estimated character counts for TTS
  const estimatedCharsPerDescription = 200; // Average characters per description

  // Calculate estimated costs for first unit (rates are per 1000 tokens/characters)
  const firstUnitCost = {
    visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * visionInputRate / 1000,
    visionOutput: estimatedOutputTokensPerUnit * visionOutputRate / 1000,
    tts: estimatedCharsPerDescription * ttsRate / 1000
  };

  // For subsequent units, we need context (e.g., previous frames)
  const contextMultiplier = settings.batchTimeMode ? 1.2 : 2; // Less overhead in batch mode

  const subsequentUnitCost = {
    visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * visionInputRate / 1000,
    visionOutput: estimatedOutputTokensPerUnit * visionOutputRate / 1000,
    tts: estimatedCharsPerDescription * ttsRate / 1000
  };

  // Calculate total costs. With 0 units neither the first-unit nor the
  // subsequent-unit cost applies; clamping keeps the totals from going
  // negative (first - subsequent) in that case.
  const firstUnits = totalUnits > 0 ? 1 : 0;
  const subsequentUnits = Math.max(totalUnits - 1, 0);

  const totalVisionInputCost =
    firstUnits * firstUnitCost.visionInput +
    subsequentUnits * subsequentUnitCost.visionInput;

  const totalVisionOutputCost =
    firstUnits * firstUnitCost.visionOutput +
    subsequentUnits * subsequentUnitCost.visionOutput;

  const totalTTSCost =
    firstUnits * firstUnitCost.tts +
    subsequentUnits * subsequentUnitCost.tts;

  const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost;

  // Create cost breakdown
  const costBreakdown: CostBreakdown = {
    videoInfo: {
      duration: videoDuration,
      totalUnits: totalUnits,
      unitType: unitType,
      processingInterval: processingInterval
    },
    providerInfo: {
      visionProvider: visionProvider,
      visionModel: visionModel,
      ttsProvider: ttsProvider,
      ttsModel: ttsModel
    },
    apiCosts: {
      visionInput: totalVisionInputCost.toFixed(4),
      visionOutput: totalVisionOutputCost.toFixed(4),
      tts: totalTTSCost.toFixed(4),
      total: totalCost.toFixed(4)
    },
    estimates: {
      totalAPICallsToProviders: totalUnits * 2, // Vision + TTS for each unit
      estimatedProcessingTimeMinutes: (totalUnits * 3) / 60 // rough estimate, 3 seconds per unit
    }
  };

  return costBreakdown;
}
|
||||
import { Config } from '../config/config';
|
||||
import { CostBreakdown } from '../interfaces';
|
||||
import { getVideoDuration } from './mediaUtils';
|
||||
|
||||
/**
 * Pricing entry for a single TTS model.
 *
 * - A plain number is treated by `calculateTTSCost` as a price per 1000
 *   characters of input text.
 * - The object form is treated as per-token pricing, with separate prices
 *   per 1M input tokens and per 1M output tokens.
 */
type TTSPricingModel = number | { inputTokens: number; outputTokens: number };
|
||||
|
||||
/**
 * Estimate the cost of generating audio descriptions for a video.
 *
 * The video is divided into units: one unit per `captureIntervalSeconds`
 * in normal mode, or one unit per `batchWindowDuration` when
 * `batchTimeMode` is set. Each unit is priced as one vision request plus
 * one TTS request, using the hard-coded pricing table below.
 *
 * @param videoFilePath - Path to the input video file
 * @param options - Configuration used for the estimate. The object is
 *   shallow-copied as-is; no defaults are filled in here, so the caller
 *   must supply every setting this function reads.
 * @returns Cost estimation breakdown. If the unit count cannot be derived
 *   (missing or non-positive interval, or a video shorter than one
 *   interval) the breakdown reports 0 units and zero costs.
 */
export async function estimateCost(
  videoFilePath: string,
  options: Partial<Config> = {}
): Promise<CostBreakdown> {
  // Shallow copy of the provided options (nothing is merged in here).
  const settings = { ...options } as Config;

  // Get video duration
  const videoDuration = getVideoDuration(videoFilePath);
  console.log(`Video duration: ${videoDuration} seconds`);

  // Work out what a "unit" is for the selected mode.
  const processingInterval = settings.batchTimeMode
    ? settings.batchWindowDuration
    : settings.captureIntervalSeconds;
  // In batch mode every unit carries several frames, so its image-token
  // estimate is scaled by the batch size; normal mode has no multiplier.
  const unitCostMultiplier: number = settings.batchTimeMode ? settings.framesInBatch : 1;
  const unitType: string = settings.batchTimeMode ? 'batches' : 'frames';

  // Calculate the number of frames or batches to process. A missing or zero
  // interval would yield NaN/Infinity, which would leak into every cost
  // figure, so fall back to 0 units instead.
  let totalUnits = Math.floor(videoDuration / processingInterval);
  if (!Number.isFinite(totalUnits) || totalUnits < 0) {
    console.warn(
      `Warning: Cannot derive a unit count from duration "${videoDuration}" and interval "${processingInterval}"; assuming 0 ${unitType}.`
    );
    totalUnits = 0;
  }

  console.log(`Will process ${totalUnits} ${unitType}`);

  // Pricing constants (per 1K units unless otherwise noted)
  const pricing: {
    vision: Record<string, Record<string, { input: number; output: number }>>;
    tts: Record<string, Record<string, TTSPricingModel>>;
  } = {
    vision: {
      openai: {
        'gpt-4o': { input: 0.0025, output: 0.01 },
        'gpt-5.4-mini': { input: 0.00015, output: 0.0006 },
        'gpt-4o-mini': { input: 0.00015, output: 0.0006 }
      },
      gemini: {
        'gemini-2.0-flash': { input: 0.0001, output: 0.0004 },
        'gemini-1.5-flash': { input: 0.000075, output: 0.0003 },
        'gemini-1.5-pro': { input: 0.00125, output: 0.005 }
      },
      openrouter: {
        'anthropic/claude-sonnet-4.5': { input: 0.003, output: 0.015 },
        'anthropic/claude-3.5-sonnet': { input: 0.003, output: 0.015 },
        'anthropic/claude-3-haiku': { input: 0.0008, output: 0.004 },
        'google/gemini-2.0-flash-001': { input: 0.0001, output: 0.0004 }
      }
    },
    tts: {
      openai: {
        'tts-1': 0.015,
        'tts-1-hd': 0.030,
        // Token-priced model: calculateTTSCost divides these by 1M tokens.
        'gpt-4o-mini-tts': { inputTokens: 0.60, outputTokens: 12.00 }
      },
      elevenlabs: {
        'eleven_multilingual_v2': 0.30,
        'eleven_turbo_v2.5': 0.015
      },
      google: {
        'chirp-hd': 0.016,
        'wavenet': 0.016,
        'neural2': 0.016,
        'standard': 0.004
      }
    }
  };

  // Get the pricing for the selected providers
  const visionProvider = settings.visionProvider;
  const visionModel = settings.visionProviders[visionProvider].model;
  const ttsProvider = settings.ttsProvider;
  const ttsModel = settings.ttsProviders[ttsProvider].model;

  // Check if the pricing data exists; unknown models are priced at 0.
  const visionPricing = pricing.vision[visionProvider]?.[visionModel];
  const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel];

  if (!visionPricing) {
    console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`);
  }

  if (!ttsPricing) {
    console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`);
  }

  const visionInputRate = visionPricing?.input ?? 0;
  const visionOutputRate = visionPricing?.output ?? 0;

  // Estimated token counts
  const estimatedVisionInputTokens = 1000 * unitCostMultiplier; // base tokens for the vision input
  const estimatedPromptTokens = 100; // tokens for the prompt text
  const estimatedOutputTokensPerUnit = 75; // tokens for the description output

  // Estimated character counts for TTS
  const estimatedCharsPerDescription = 200;

  // Calculate estimated costs for first unit (vision rates are per 1000 tokens)
  const firstUnitCost = {
    visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * visionInputRate / 1000,
    visionOutput: estimatedOutputTokensPerUnit * visionOutputRate / 1000,
    tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
  };

  // For subsequent units, we need context (e.g., previous frames)
  const contextMultiplier = settings.batchTimeMode ? 1.2 : 2;

  const subsequentUnitCost = {
    visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * visionInputRate / 1000,
    visionOutput: estimatedOutputTokensPerUnit * visionOutputRate / 1000,
    tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
  };

  // Calculate total costs. With 0 units neither the first-unit nor the
  // subsequent-unit cost applies; clamping keeps the totals from going
  // negative (first - subsequent) in that case.
  const firstUnits = totalUnits > 0 ? 1 : 0;
  const subsequentUnits = Math.max(totalUnits - 1, 0);

  const totalVisionInputCost =
    firstUnits * firstUnitCost.visionInput +
    subsequentUnits * subsequentUnitCost.visionInput;

  const totalVisionOutputCost =
    firstUnits * firstUnitCost.visionOutput +
    subsequentUnits * subsequentUnitCost.visionOutput;

  const totalTTSCost =
    firstUnits * firstUnitCost.tts +
    subsequentUnits * subsequentUnitCost.tts;

  const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost;

  // Create cost breakdown
  const costBreakdown: CostBreakdown = {
    videoInfo: {
      duration: videoDuration,
      totalUnits: totalUnits,
      unitType: unitType,
      processingInterval: processingInterval
    },
    providerInfo: {
      visionProvider: visionProvider,
      visionModel: visionModel,
      ttsProvider: ttsProvider,
      ttsModel: ttsModel
    },
    apiCosts: {
      visionInput: totalVisionInputCost.toFixed(4),
      visionOutput: totalVisionOutputCost.toFixed(4),
      tts: totalTTSCost.toFixed(4),
      total: totalCost.toFixed(4)
    },
    estimates: {
      totalAPICallsToProviders: totalUnits * 2, // two calls counted per unit
      estimatedProcessingTimeMinutes: (totalUnits * 3) / 60 // 3 seconds per unit, in minutes
    }
  };

  return costBreakdown;
}
|
||||
|
||||
/**
 * Price the speech synthesis of one description.
 *
 * @param charCount - Number of characters of text to be spoken
 * @param pricing - Pricing entry for the selected TTS model, if known
 * @returns Estimated cost; 0 when no (or a zero) pricing entry is given
 */
function calculateTTSCost(charCount: number, pricing: TTSPricingModel | undefined): number {
  // Unknown model (or a zero rate): nothing to charge.
  if (!pricing) {
    return 0;
  }

  if (typeof pricing !== 'number') {
    // Token-based pricing (e.g., gpt-4o-mini-tts): rates are per 1M tokens.
    const { inputTokens: inputRate, outputTokens: outputRate } = pricing;
    // Rough estimate: 1 char ≈ 0.25 text tokens for English text.
    const textTokens = charCount * 0.25;
    // Audio output is token-heavy: estimated at 3 tokens per character.
    const audioTokens = charCount * 3;
    return (textTokens * inputRate + audioTokens * outputRate) / 1000000;
  }

  // Character-based pricing: the rate is a cost per 1000 characters.
  return charCount * pricing / 1000;
}
|
||||
|
||||
Reference in New Issue
Block a user