// aidio-description/src/utils/costEstimator.ts

import { Config } from '../config/config';
import { CostBreakdown } from '../interfaces';
import { getVideoDuration } from './mediaUtils';

/**
 * Pricing entry for a single TTS model.
 *
 * - A plain number is a flat rate applied per 1000 characters of text.
 * - An object carries separate input/output token rates, each applied per
 *   1M tokens (see calculateTTSCost for how both forms are evaluated).
 */
type TTSPricingModel = number | { inputTokens: number; outputTokens: number };

/**
 * Estimate the cost of generating audio descriptions for a video.
 *
 * The video is split into processing units: batches when `batchTimeMode` is
 * set, single frames otherwise. The first unit is priced without context;
 * every later unit is priced with a context multiplier applied to its vision
 * input tokens. Models without an entry in the built-in pricing table
 * contribute a cost of 0 and trigger a console warning.
 *
 * @param videoFilePath - Path to the input video file
 * @param options - Configuration used for the estimate. No defaults are
 *   applied by this function, so the caller must supply every setting it
 *   reads (interval settings, provider names and provider model maps).
 * @returns Cost estimation breakdown; cost figures are strings formatted to
 *   four decimal places
 * @throws Error if the configured processing interval is not a positive,
 *   finite number
 */
export async function estimateCost(
  videoFilePath: string,
  options: Partial<Config> = {}
): Promise<CostBreakdown> {
  // Shallow copy of the caller's options. NOTE: nothing is merged with
  // defaults here; missing settings stay undefined.
  const settings = { ...options } as Config;

  // Get video duration
  const videoDuration = getVideoDuration(videoFilePath);
  console.log(`Video duration: ${videoDuration} seconds`);

  // The interval that defines one processing unit depends on the mode.
  const processingInterval = settings.batchTimeMode
    ? settings.batchWindowDuration
    : settings.captureIntervalSeconds;

  // A zero, negative or missing interval would otherwise turn into
  // Infinity/NaN unit counts and meaningless cost figures.
  if (!Number.isFinite(processingInterval) || processingInterval <= 0) {
    throw new Error(
      `Invalid processing interval "${processingInterval}": expected a positive number.`
    );
  }

  // Calculate the number of frames or batches to process (never negative)
  const totalUnits = Math.max(0, Math.floor(videoDuration / processingInterval));
  const unitCostMultiplier = settings.batchTimeMode ? settings.framesInBatch : 1;
  const unitType = settings.batchTimeMode ? "batches" : "frames";

  console.log(`Will process ${totalUnits} ${unitType}`);

  // Pricing constants (per 1K units unless otherwise noted; token-based TTS
  // entries are rates per 1M tokens)
  const pricing: {
    vision: Record<string, Record<string, { input: number; output: number }>>;
    tts: Record<string, Record<string, TTSPricingModel>>;
  } = {
    vision: {
      openai: {
        'gpt-4o': { input: 0.0025, output: 0.01 },
        'gpt-5.4-mini': { input: 0.00015, output: 0.0006 },
        'gpt-4o-mini': { input: 0.00015, output: 0.0006 }
      },
      gemini: {
        'gemini-2.0-flash': { input: 0.0001, output: 0.0004 },
        'gemini-1.5-flash': { input: 0.000075, output: 0.0003 },
        'gemini-1.5-pro': { input: 0.00125, output: 0.005 }
      },
      openrouter: {
        'anthropic/claude-sonnet-4.5': { input: 0.003, output: 0.015 },
        'anthropic/claude-3.5-sonnet': { input: 0.003, output: 0.015 },
        'anthropic/claude-3-haiku': { input: 0.0008, output: 0.004 },
        'google/gemini-2.0-flash-001': { input: 0.0001, output: 0.0004 }
      }
    },
    tts: {
      openai: {
        'tts-1': 0.015,
        'tts-1-hd': 0.030,
        'gpt-4o-mini-tts': { inputTokens: 0.60, outputTokens: 12.00 }
      },
      elevenlabs: {
        'eleven_multilingual_v2': 0.30,
        'eleven_turbo_v2.5': 0.015
      },
      google: {
        'chirp-hd': 0.016,
        'wavenet': 0.016,
        'neural2': 0.016,
        'standard': 0.004
      }
    }
  };

  // Get the pricing for the selected providers
  const visionProvider = settings.visionProvider;
  const visionModel = settings.visionProviders[visionProvider].model;
  const ttsProvider = settings.ttsProvider;
  const ttsModel = settings.ttsProviders[ttsProvider].model;

  // Check if the pricing data exists
  const visionPricing = pricing.vision[visionProvider]?.[visionModel];
  const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel];

  if (!visionPricing) {
    console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`);
  }

  if (!ttsPricing) {
    console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`);
  }

  // Unknown models are priced at 0 so the rest of the estimate still works.
  const visionInputRate = visionPricing?.input ?? 0;
  const visionOutputRate = visionPricing?.output ?? 0;

  // Estimated token counts
  const estimatedVisionInputTokens = 1000 * unitCostMultiplier;
  const estimatedPromptTokens = 100;
  const estimatedOutputTokensPerUnit = 75;

  // Estimated character counts for TTS
  const estimatedCharsPerDescription = 200;

  // Output and TTS cost do not depend on whether a unit is first or not.
  const visionOutputPerUnit = estimatedOutputTokensPerUnit * visionOutputRate / 1000;
  const ttsPerUnit = calculateTTSCost(estimatedCharsPerDescription, ttsPricing);

  // Calculate estimated costs for first unit
  const firstUnitCost = {
    visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * visionInputRate / 1000,
    visionOutput: visionOutputPerUnit,
    tts: ttsPerUnit
  };

  // For subsequent units, we need context (e.g., previous frames)
  const contextMultiplier = settings.batchTimeMode ? 1.2 : 2;

  const subsequentUnitCost = {
    visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * visionInputRate / 1000,
    visionOutput: visionOutputPerUnit,
    tts: ttsPerUnit
  };

  // Split the units into "first" and "subsequent". With zero units there is
  // neither, so the totals are 0 rather than going negative.
  const firstUnits = totalUnits > 0 ? 1 : 0;
  const subsequentUnits = Math.max(totalUnits - 1, 0);

  // Calculate total costs
  const totalVisionInputCost =
    firstUnits * firstUnitCost.visionInput +
    subsequentUnits * subsequentUnitCost.visionInput;

  const totalVisionOutputCost =
    firstUnits * firstUnitCost.visionOutput +
    subsequentUnits * subsequentUnitCost.visionOutput;

  const totalTTSCost =
    firstUnits * firstUnitCost.tts +
    subsequentUnits * subsequentUnitCost.tts;

  const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost;

  // Create cost breakdown
  const costBreakdown: CostBreakdown = {
    videoInfo: {
      duration: videoDuration,
      totalUnits: totalUnits,
      unitType: unitType,
      processingInterval: processingInterval
    },
    providerInfo: {
      visionProvider: visionProvider,
      visionModel: visionModel,
      ttsProvider: ttsProvider,
      ttsModel: ttsModel
    },
    apiCosts: {
      visionInput: totalVisionInputCost.toFixed(4),
      visionOutput: totalVisionOutputCost.toFixed(4),
      tts: totalTTSCost.toFixed(4),
      total: totalCost.toFixed(4)
    },
    estimates: {
      // Two provider calls are assumed per unit, and 3 seconds of
      // processing per unit (converted to minutes).
      totalAPICallsToProviders: totalUnits * 2,
      estimatedProcessingTimeMinutes: (totalUnits * 3) / 60
    }
  };

  return costBreakdown;
}

/**
 * Estimate the TTS cost for a piece of text of the given length.
 *
 * @param charCount - Number of characters to synthesize
 * @param pricing - Flat rate per 1000 characters, or per-token rates per 1M
 *   tokens; a missing (or otherwise falsy) value yields a cost of 0
 * @returns Estimated cost for `charCount` characters
 */
function calculateTTSCost(charCount: number, pricing: TTSPricingModel | undefined): number {
  // No pricing information: this model contributes nothing to the estimate.
  if (!pricing) {
    return 0;
  }

  if (typeof pricing !== 'number') {
    // Token-based pricing (e.g., gpt-4o-mini-tts), rates are per 1M tokens.
    const { inputTokens: inputRate, outputTokens: outputRate } = pricing;

    // Rough estimate: 1 char ≈ 0.25 text tokens for English text.
    const textTokens = charCount * 0.25;
    // Audio output is token-heavy: roughly 3 tokens per character.
    const audioTokens = charCount * 3;

    const weightedTokens = textTokens * inputRate + audioTokens * outputRate;
    return weightedTokens / 1000000;
  }

  // Character-based pricing: rate is the cost per 1000 characters.
  return charCount * pricing / 1000;
}