Update cost estimator with all new models and per-token TTS pricing support

2026-05-13 02:44:30 +02:00
parent f05e57493c
commit 3a198d7d50
1 changed files with 182 additions and 150 deletions
--- a/src/utils/costEstimator.ts
+++ b/src/utils/costEstimator.ts
@@ -2,6 +2,8 @@ import { Config } from '../config/config';
 import { CostBreakdown } from '../interfaces';
 import { getVideoDuration } from './mediaUtils';

+type TTSPricingModel = number | { inputTokens: number; outputTokens: number };
+
 /**
 * Estimate the cost of generating audio descriptions for a video
 * @param videoFilePath - Path to the input video file
@@ -26,39 +28,54 @@ export async function estimateCost(
  
  if (settings.batchTimeMode) {
    totalUnits = Math.floor(videoDuration / settings.batchWindowDuration);
-    unitCostMultiplier = settings.framesInBatch; // Cost multiplier for batch mode
+    unitCostMultiplier = settings.framesInBatch;
    unitType = "batches";
  } else {
    totalUnits = Math.floor(videoDuration / settings.captureIntervalSeconds);
-    unitCostMultiplier = 1; // No multiplier for normal mode
+    unitCostMultiplier = 1;
    unitType = "frames";
  }
  
  console.log(`Will process ${totalUnits} ${unitType}`);

-  // Pricing constants (as of March 2025, update as needed)
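+  // Pricing constants: vision rates are per 1K tokens, numeric TTS rates are per 1K characters, and per-token TTS rates ({ inputTokens, outputTokens }) are per 1M tokens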
  const pricing: {
    vision: Record<string, Record<string, { input: number; output: number }>>;
-    tts: Record<string, Record<string, number>>;
+    tts: Record<string, Record<string, TTSPricingModel>>;
  } = {
    vision: {
      openai: {
-        'gpt-4o': {
-          input: 0.0025,
-          output: 0.01
-        }
+        'gpt-4o': { input: 0.0025, output: 0.01 },
+        'gpt-5.4-mini': { input: 0.00015, output: 0.0006 },
+        'gpt-4o-mini': { input: 0.00015, output: 0.0006 }
      },
      gemini: {
-        'gemini-pro-vision': {
-          input: 0.0025,
-          output: 0.0025
-        }
+        'gemini-2.0-flash': { input: 0.0001, output: 0.0004 },
+        'gemini-1.5-flash': { input: 0.000075, output: 0.0003 },
+        'gemini-1.5-pro': { input: 0.00125, output: 0.005 }
+      },
+      openrouter: {
+        'anthropic/claude-sonnet-4.5': { input: 0.003, output: 0.015 },
+        'anthropic/claude-3.5-sonnet': { input: 0.003, output: 0.015 },
+        'anthropic/claude-3-haiku': { input: 0.0008, output: 0.004 },
+        'google/gemini-2.0-flash-001': { input: 0.0001, output: 0.0004 }
      }
    },
    tts: {
      openai: {
        'tts-1': 0.015,
-        'tts-1-hd': 0.030
+        'tts-1-hd': 0.030,
+        'gpt-4o-mini-tts': { inputTokens: 0.60, outputTokens: 12.00 }
+      },
+      elevenlabs: {
+        'eleven_multilingual_v2': 0.30,
+        'eleven_turbo_v2.5': 0.015
+      },
+      google: {
+        'chirp-hd': 0.016,
+        'wavenet': 0.016,
+        'neural2': 0.016,
+        'standard': 0.004
      }
    }
  };
@@ -82,27 +99,27 @@ export async function estimateCost(
  }

  // Estimated token counts
-  const estimatedVisionInputTokens = 1000 * unitCostMultiplier; // Base tokens for the vision input
-  const estimatedPromptTokens = 100; // Tokens for the prompt text
-  const estimatedOutputTokensPerUnit = 75; // Average tokens for description output
+  const estimatedVisionInputTokens = 1000 * unitCostMultiplier;
+  const estimatedPromptTokens = 100;
+  const estimatedOutputTokensPerUnit = 75;

  // Estimated character counts for TTS
-  const estimatedCharsPerDescription = 200; // Average characters per description
+  const estimatedCharsPerDescription = 200;

  // Calculate estimated costs for first unit
  const firstUnitCost = {
    visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
    visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
-    tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000
+    tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
  };

  // For subsequent units, we need context (e.g., previous frames)
-  const contextMultiplier = settings.batchTimeMode ? 1.2 : 2; // Less overhead in batch mode
+  const contextMultiplier = settings.batchTimeMode ? 1.2 : 2;
  
  const subsequentUnitCost = {
    visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
    visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
-    tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000
+    tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
  };

  // Calculate total costs
@@ -141,10 +158,25 @@ export async function estimateCost(
      total: totalCost.toFixed(4)
    },
    estimates: {
-      totalAPICallsToProviders: totalUnits * 2, // Vision + TTS for each unit
-      estimatedProcessingTimeMinutes: (totalUnits * 3) / 60 // rough estimate, 3 seconds per unit
+      totalAPICallsToProviders: totalUnits * 2,
+      estimatedProcessingTimeMinutes: (totalUnits * 3) / 60
    }
  };

  return costBreakdown;
 }
+/**
+ * Estimate the text-to-speech cost of synthesizing a given number of characters.
+ * @param charCount - Number of characters of text to be spoken
+ * @param pricing - Either a flat rate per 1000 characters (number), or per-token
+ *   rates per 1M tokens ({ inputTokens, outputTokens }); undefined when no rate is known
+ * @returns Estimated cost, in the same currency unit as the supplied rates;
+ *   0 when pricing is missing or is a numeric rate of 0
+ */
+function calculateTTSCost(charCount: number, pricing: TTSPricingModel | undefined): number {
+  if (!pricing) return 0; // no usable rate (undefined or 0) contributes nothing to the estimate
+
+  if (typeof pricing === 'number') {
+    // Per-character pricing: the rate is the cost per 1000 characters
+    return charCount * pricing / 1000;
+  }
+
+  // Per-token pricing (e.g., gpt-4o-mini-tts): both rates are the cost per 1M tokens
+  // Rough estimate: 1 char ≈ 0.25 input tokens for English text
+  const estimatedInputTokens = charCount * 0.25;
+  const estimatedOutputTokens = charCount * 3; // audio output is token-heavy: heuristic of 3 output tokens per character
+  return (estimatedInputTokens * pricing.inputTokens + estimatedOutputTokens * pricing.outputTokens) / 1000000; // divide by 1M to match the per-1M-token rates
+}