Update cost estimator with all new models and per-token TTS pricing support

Add Google Cloud Chirp 3 TTS provider with service account support
Add OpenRouter vision provider for multi-model access via unified API
2026-05-13 02:44:30 +02:00 · 2026-05-13 02:42:54 +02:00 · 2026-05-13 02:40:03 +02:00 · 2026-05-13 02:38:16 +02:00 · 2026-05-13 02:36:46 +02:00
16 changed files with 1478 additions and 169 deletions
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@@ -19,6 +19,7 @@
    "prepublishOnly": "npm run build"
  },
  "dependencies": {
+    "@google-cloud/text-to-speech": "^6.4.1",
    "@google/generative-ai": "^0.24.0",
    "axios": "^1.6.2",
    "dotenv": "^16.3.1",
@@ -51,4 +52,4 @@
  ],
  "author": "",
  "license": "MIT"
-}
+}
--- a/src/cli/args.ts
+++ b/src/cli/args.ts
@@ -13,6 +13,7 @@ export interface CLIArgs {
  ttsModel?: string;
  ttsVoice?: string;
  ttsSpeedFactor?: number;
+  ttsInstructions?: string;
  outputDir?: string;
  tempDir?: string;
  batchTimeMode?: boolean;
@@ -73,6 +74,10 @@ export function parseCommandLineArgs(): CLIArgs {
      describe: 'Speed factor for the audio playback',
      type: 'number'
    })
+    .option('ttsInstructions', {
+      describe: 'Instructions for TTS voice style (gpt-4o-mini-tts)',
+      type: 'string'
+    })
    .option('outputDir', {
      alias: 'o',
      describe: 'Directory for output files',
--- a/src/config/config.ts
+++ b/src/config/config.ts
@@ -18,6 +18,7 @@ export interface Config {
  ttsProvider: string;
  ttsVoice: string;
  ttsSpeedFactor: number;
+  ttsInstructions?: string;
  ttsProviders: {
    [key: string]: TTSProviderConfig;
  };
@@ -61,6 +62,12 @@ export function getDefaultConfig(): Config {
        baseUrl: "http://localhost:11434",
        model: "gemma3:12b",
        maxTokens: 3000
+      },
+      openrouter: {
+        apiKey: process.env.OPENROUTER_API_KEY,
+        model: "anthropic/claude-sonnet-4.5",
+        baseUrl: "https://openrouter.ai/api/v1",
+        maxTokens: 300
      }
    },
    
@@ -68,11 +75,23 @@ export function getDefaultConfig(): Config {
    ttsProvider: "openai",
    ttsVoice: "alloy",
    ttsSpeedFactor: 1.5,
+    ttsInstructions: "Speak in a calm, narrating tone suitable for audio descriptions. Keep a steady pace and clear enunciation.",
    ttsProviders: {
      openai: {
        apiKey: process.env.OPENAI_API_KEY,
-        model: "tts-1-hd",
+        model: "gpt-4o-mini-tts",
        voice: "alloy"
+      },
+      elevenlabs: {
+        apiKey: process.env.ELEVENLABS_API_KEY,
+        model: "eleven_multilingual_v2",
+        voice: "JBFqnCBsd6RMkjVDRZzb"
+      },
+      google: {
+        apiKey: process.env.GOOGLE_CLOUD_TTS_KEY,
+        keyFilename: process.env.GOOGLE_CLOUD_TTS_KEYFILE,
+        model: "chirp-hd",
+        voice: "en-US-Chirp-HD-F"
      }
    },
    
--- a/src/index.ts
+++ b/src/index.ts
@@ -77,6 +77,10 @@ async function main(): Promise<void> {
    config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
  }

+  if (argv.ttsInstructions) {
+    config.ttsInstructions = argv.ttsInstructions;
+  }
+
  if (argv.saveConfig) {
    saveConfigToFile(argv.saveConfig, config);
  }
--- a/src/interfaces/index.ts
+++ b/src/interfaces/index.ts
@@ -39,12 +39,14 @@ export interface TTSOptions {
  voice?: string;
  model?: string;
  speedFactor?: number;
+  instructions?: string;
 }

 export interface TTSProviderConfig {
  apiKey?: string;
  model: string;
  voice?: string;
+  keyFilename?: string;
 }

 export interface TTSProvider {
--- a/src/providers/tts/elevenLabsTTSProvider.ts
+++ b/src/providers/tts/elevenLabsTTSProvider.ts
@@ -0,0 +1,93 @@
+import fs from 'fs';
+import { execSync } from 'child_process';
+import axios, { AxiosInstance } from 'axios';
+import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
+import { getAudioDuration } from '../../utils/mediaUtils';
+
+/**
+ * TTS provider backed by the ElevenLabs text-to-speech REST API.
+ * The `request-id` of each response is remembered and sent with the next
+ * request as `previous_request_ids`, linking consecutive segments.
+ */
+export class ElevenLabsTTSProvider implements TTSProvider {
+  private config: TTSProviderConfig;
+  private axiosInstance: AxiosInstance;
+  private lastRequestId: string | null = null;
+
+  constructor(config: TTSProviderConfig) {
+    this.config = config;
+    this.axiosInstance = axios.create({
+      baseURL: 'https://api.elevenlabs.io/v1',
+      headers: { 'xi-api-key': config.apiKey, 'Content-Type': 'application/json' }
+    });
+  }
+
+  /**
+   * Synthesize `text` into an MP3 file at `outputPath`.
+   * @param text - Text to speak
+   * @param outputPath - Destination audio file
+   * @param options - Optional voice, model and speed overrides
+   * @returns Audio duration and the character count as cost; on any failure
+   *          one second of silence is written and `{ duration: 1, cost: 0 }` returned
+   */
+  async textToSpeech(
+    text: string,
+    outputPath: string,
+    options: TTSOptions = {}
+  ): Promise<TTSResult> {
+    try {
+      const voice = options.voice || this.config.voice || 'JBFqnCBsd6RMkjVDRZzb';
+      const model = options.model || this.config.model || 'eleven_multilingual_v2';
+      const speedFactor = options.speedFactor || 1.0;
+
+      // The speed factor is applied exactly once, by the ffmpeg atempo pass
+      // below. Sending it as `voice_settings.speed` as well compounded the
+      // two (a factor of 1.5 became 2.25x).
+      const requestBody: any = {
+        text,
+        model_id: model,
+        voice_settings: {
+          stability: 0.5,
+          similarity_boost: 0.75,
+          use_speaker_boost: true
+        }
+      };
+
+      // Link this segment to the previous one for continuity.
+      if (this.lastRequestId) {
+        requestBody.previous_request_ids = [this.lastRequestId];
+      }
+
+      const response = await this.axiosInstance.post(
+        `/text-to-speech/${voice}`,
+        requestBody,
+        { params: { output_format: 'mp3_44100_128' }, responseType: 'arraybuffer' }
+      );
+      this.lastRequestId = response.headers['request-id'] || null;
+
+      // Stage the raw audio beside the target, e.g. clip.mp3 -> clip_temp.mp3.
+      const tempOutputPath = outputPath.replace(/\.\w+$/, '_temp$&');
+      fs.writeFileSync(tempOutputPath, Buffer.from(response.data));
+
+      if (speedFactor !== 1.0) {
+        execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
+        fs.unlinkSync(tempOutputPath);
+      } else {
+        fs.renameSync(tempOutputPath, outputPath);
+      }
+
+      // Cost is reported as the number of characters sent.
+      return { duration: getAudioDuration(outputPath), cost: text.length };
+    } catch (error: any) {
+      if (error.response) {
+        console.error(`ElevenLabs TTS error (${error.response.status}):`,
+          Buffer.from(error.response.data).toString());
+      } else {
+        console.error('ElevenLabs TTS error:', error.message);
+      }
+      // Best effort: write one second of silence to outputPath instead of failing.
+      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);
+      return { duration: 1, cost: 0 };
+    }
+  }
+}
--- a/src/providers/tts/googleCloudTTSProvider.ts
+++ b/src/providers/tts/googleCloudTTSProvider.ts
@@ -0,0 +1,94 @@
+import fs from 'fs';
+import { execSync } from 'child_process';
+import { TextToSpeechClient } from '@google-cloud/text-to-speech';
+import { google } from '@google-cloud/text-to-speech/build/protos/protos';
+import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
+import { getAudioDuration } from '../../utils/mediaUtils';
+
+/**
+ * TTS provider backed by the Google Cloud Text-to-Speech client library.
+ * The client is given the API key and, when configured, a service account key file.
+ */
+export class GoogleCloudTTSProvider implements TTSProvider {
+  private config: TTSProviderConfig;
+  private client: TextToSpeechClient;
+
+  constructor(config: TTSProviderConfig) {
+    this.config = config;
+
+    const clientConfig: any = {
+      apiKey: config.apiKey,
+      fallback: true
+    };
+
+    if (config.keyFilename) {
+      clientConfig.keyFilename = config.keyFilename;
+    }
+
+    this.client = new TextToSpeechClient(clientConfig);
+  }
+
+  /**
+   * Synthesize `text` into an MP3 file at `outputPath`.
+   * @param text - Text to speak
+   * @param outputPath - Destination audio file
+   * @param options - Optional voice and speed overrides (`options.model` is not sent)
+   * @returns Audio duration and the character count as cost; on any failure
+   *          one second of silence is written and `{ duration: 1, cost: 0 }` returned
+   */
+  async textToSpeech(
+    text: string,
+    outputPath: string,
+    options: TTSOptions = {}
+  ): Promise<TTSResult> {
+    try {
+      const voice = options.voice || this.config.voice || 'en-US-Chirp-HD-F';
+      const speedFactor = options.speedFactor || 1.0;
+
+      // The speed factor is applied exactly once, by the ffmpeg atempo pass
+      // below. Also setting `audioConfig.speakingRate` compounded the two
+      // (a factor of 1.5 became 2.25x).
+      const request: google.cloud.texttospeech.v1.ISynthesizeSpeechRequest = {
+        input: { text },
+        voice: { languageCode: this.extractLanguageCode(voice), name: voice },
+        audioConfig: { audioEncoding: 'MP3' }
+      };
+
+      const [response] = await this.client.synthesizeSpeech(request);
+      if (!response.audioContent) {
+        throw new Error('No audio content returned from Google Cloud TTS');
+      }
+
+      const audioBuffer = response.audioContent instanceof Uint8Array
+        ? Buffer.from(response.audioContent)
+        : Buffer.from(response.audioContent as any);
+
+      const tempOutputPath = outputPath.replace(/\.\w+$/, '_temp$&');
+      fs.writeFileSync(tempOutputPath, audioBuffer);
+
+      if (speedFactor !== 1.0) {
+        execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
+        fs.unlinkSync(tempOutputPath);
+      } else {
+        fs.renameSync(tempOutputPath, outputPath);
+      }
+
+      // Cost is reported as the number of characters sent.
+      return { duration: getAudioDuration(outputPath), cost: text.length };
+    } catch (error: any) {
+      console.error('Google Cloud TTS error:', error.message);
+      // Best effort: write one second of silence to outputPath instead of failing.
+      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);
+      return { duration: 1, cost: 0 };
+    }
+  }
+
+  /** Derive the language code from the first two dash-separated parts of a voice name. */
+  private extractLanguageCode(voiceName: string): string {
+    const parts = voiceName.split('-');
+    if (parts.length >= 2) {
+      return `${parts[0]}-${parts[1]}`;
+    }
+    return 'en-US';
+  }
+}
--- a/src/providers/tts/index.ts
+++ b/src/providers/tts/index.ts
@@ -1,2 +1,4 @@
 export * from './ttsProviderFactory';
-export * from './openAITTSProvider';
+export * from './openAITTSProvider';
+export * from './elevenLabsTTSProvider';
+export * from './googleCloudTTSProvider';
--- a/src/providers/tts/openAITTSProvider.ts
+++ b/src/providers/tts/openAITTSProvider.ts
@@ -41,8 +41,9 @@ export class OpenAITTSProvider implements TTSProvider {

      const mp3 = await this.openai.audio.speech.create({
        model: model,
-        voice: voice as any, // Type casting to any to avoid type issues
-        input: text
+        voice: voice as any,
+        input: text,
+        ...(options.instructions ? { instructions: options.instructions } : {})
      });

      // Cost calculation is based on character count
--- a/src/providers/tts/ttsProviderFactory.ts
+++ b/src/providers/tts/ttsProviderFactory.ts
@@ -1,6 +1,8 @@
 import { TTSProvider } from '../../interfaces';
 import { Config } from '../../config/config';
 import { OpenAITTSProvider } from './openAITTSProvider';
+import { ElevenLabsTTSProvider } from './elevenLabsTTSProvider';
+import { GoogleCloudTTSProvider } from './googleCloudTTSProvider';

 /**
 * Factory for creating TTS providers
@@ -17,6 +19,10 @@ export class TTSProviderFactory {
    switch (providerName) {
      case 'openai':
        return new OpenAITTSProvider(providerConfig);
+      case 'elevenlabs':
+        return new ElevenLabsTTSProvider(providerConfig);
+      case 'google':
+        return new GoogleCloudTTSProvider(providerConfig);
      // Add other providers here
      default:
        throw new Error(`TTS provider "${providerName}" not implemented.`);
--- a/src/providers/vision/index.ts
+++ b/src/providers/vision/index.ts
@@ -1,4 +1,5 @@
 export * from './visionProviderFactory';
 export * from './openAIVisionProvider';
 export * from './geminiVisionProvider';
-export * from './ollamaVisionProvider';
+export * from './ollamaVisionProvider';
+export * from './openRouterVisionProvider';
--- a/src/providers/vision/openRouterVisionProvider.ts
+++ b/src/providers/vision/openRouterVisionProvider.ts
@@ -0,0 +1,171 @@
+import fs from 'fs';
+import axios, { AxiosInstance } from 'axios';
+import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
+
+/**
+ * Vision provider that sends images to OpenRouter's `/chat/completions`
+ * endpoint, so any model exposed there can be selected through one API.
+ *
+ * Every image is read from disk and embedded in the request as a base64
+ * `data:image/jpeg` URL. Request failures never throw to the caller: each
+ * public method logs the error and returns a fallback description with
+ * zero token usage.
+ */
+export class OpenRouterVisionProvider implements VisionProvider {
+  private config: VisionProviderConfig;
+  private axiosInstance: AxiosInstance;
+
+  /**
+   * @param config - API key, model, optional base URL and token limit
+   */
+  constructor(config: VisionProviderConfig) {
+    this.config = config;
+    this.axiosInstance = axios.create({
+      baseURL: config.baseUrl || 'https://openrouter.ai/api/v1',
+      headers: {
+        'Authorization': `Bearer ${config.apiKey}`,
+        'Content-Type': 'application/json',
+        'HTTP-Referer': 'https://github.com/anomalyco/aidio-description',
+        'X-Title': 'Aidio Description Generator'
+      }
+    });
+  }
+
+  /**
+   * Describe a single image.
+   * @param imagePath - Path to the image file
+   * @param prompt - Instruction text sent ahead of the image
+   * @returns The model's description and token usage, or a fallback
+   *          description with zero usage if the request fails
+   */
+  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
+    try {
+      const messages = [this.buildUserMessage(prompt, [imagePath])];
+      return await this.requestDescription(messages);
+    } catch (error: any) {
+      console.error('OpenRouter describeImage error:', error.response?.data || error.message);
+      return this.failureResult('Unable to describe this image.');
+    }
+  }
+
+  /**
+   * Send two images with one prompt, e.g. to describe what differs between them.
+   * @param image1Path - Path to the first image
+   * @param image2Path - Path to the second image
+   * @param prompt - Instruction text sent ahead of the images
+   * @returns The model's description and token usage, or a fallback
+   *          description with zero usage if the request fails
+   */
+  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
+    try {
+      const messages = [this.buildUserMessage(prompt, [image1Path, image2Path])];
+      return await this.requestDescription(messages);
+    } catch (error: any) {
+      console.error('OpenRouter compareImages error:', error.response?.data || error.message);
+      return this.failureResult('Unable to describe the differences between these images.');
+    }
+  }
+
+  /**
+   * Describe a batch of images in one request.
+   * @param imagePaths - Paths of the images, sent in the given order
+   * @param lastBatchContext - Context of the previous batch; when it has a
+   *        `lastDescription`, that text is prepended as a system message
+   * @param prompt - Instruction text sent ahead of the images
+   * @returns The model's description and token usage, or a fallback
+   *          description with zero usage if the request fails
+   */
+  async describeBatch(
+    imagePaths: string[],
+    lastBatchContext: BatchContext,
+    prompt: string
+  ): Promise<VisionResult> {
+    try {
+      const messages: any[] = [this.buildUserMessage(prompt, imagePaths)];
+
+      // Put the previous batch's description in front of the user message.
+      if (lastBatchContext && lastBatchContext.lastDescription) {
+        messages.unshift({
+          role: 'system',
+          content: `Previous batch summary: ${lastBatchContext.lastDescription}`
+        });
+      }
+
+      return await this.requestDescription(messages);
+    } catch (error: any) {
+      console.error('OpenRouter describeBatch error:', error.response?.data || error.message);
+      return this.failureResult('Unable to describe this batch of images.');
+    }
+  }
+
+  /**
+   * Build the user message: the prompt text followed by one `image_url`
+   * part per image.
+   * @param prompt - Instruction text
+   * @param imagePaths - Image files to read and embed as base64 data URLs
+   * @returns A chat message with role `user`
+   */
+  private buildUserMessage(prompt: string, imagePaths: string[]): any {
+    const imageParts = imagePaths.map(imagePath => {
+      const base64Image = fs.readFileSync(imagePath).toString('base64');
+      // The MIME type is always declared as JPEG; the file type is not inspected.
+      return {
+        type: 'image_url',
+        image_url: {
+          url: `data:image/jpeg;base64,${base64Image}`
+        }
+      };
+    });
+
+    return {
+      role: 'user',
+      content: [
+        { type: 'text', text: prompt },
+        ...imageParts
+      ]
+    };
+  }
+
+  /**
+   * Send a chat completion request and convert the reply to a VisionResult.
+   *
+   * All three public methods go through this helper, so they share the same
+   * request parameters. Batch requests previously sent no `temperature`
+   * while the single-image and comparison requests used 0.1.
+   * @param messages - Chat messages to send
+   * @returns Description text (or a placeholder when the reply has none)
+   *          and token usage, with missing usage fields reported as 0
+   * @throws Whatever the HTTP client throws for a failed request; callers catch it
+   */
+  private async requestDescription(messages: any[]): Promise<VisionResult> {
+    const response = await this.axiosInstance.post('/chat/completions', {
+      model: this.config.model,
+      temperature: 0.1,
+      messages,
+      max_tokens: this.config.maxTokens || 300
+    });
+
+    // Optional chaining tolerates replies that lack `choices` or `usage`.
+    const data = response.data;
+    return {
+      description: data.choices?.[0]?.message?.content?.trim() || 'No description generated.',
+      usage: {
+        inputTokens: data.usage?.prompt_tokens || 0,
+        outputTokens: data.usage?.completion_tokens || 0,
+        totalTokens: data.usage?.total_tokens || 0
+      }
+    };
+  }
+
+  /**
+   * Build the result returned when a request fails.
+   * @param description - Fallback text for the caller
+   * @returns The fallback text with all usage counters set to 0
+   */
+  private failureResult(description: string): VisionResult {
+    return {
+      description,
+      usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
+    };
+  }
+}
--- a/src/providers/vision/visionProviderFactory.ts
+++ b/src/providers/vision/visionProviderFactory.ts
@@ -3,6 +3,7 @@ import { Config } from '../../config/config';
 import { OpenAIVisionProvider } from './openAIVisionProvider';
 import { GeminiVisionProvider } from './geminiVisionProvider';
 import { OllamaVisionProvider } from './ollamaVisionProvider';
+import { OpenRouterVisionProvider } from './openRouterVisionProvider';

 /**
 * Factory for creating vision AI providers
@@ -23,6 +24,8 @@ export class VisionProviderFactory {
        return new GeminiVisionProvider(providerConfig);
      case "ollama":
        return new OllamaVisionProvider(providerConfig);
+      case 'openrouter':
+        return new OpenRouterVisionProvider(providerConfig);
      // Add other providers here
      default:
        throw new Error(`Vision provider "${providerName}" not implemented.`);
--- a/src/utils/costEstimator.ts
+++ b/src/utils/costEstimator.ts
@@ -1,150 +1,182 @@
-import { Config } from '../config/config';
-import { CostBreakdown } from '../interfaces';
-import { getVideoDuration } from './mediaUtils';
-
-/**
- * Estimate the cost of generating audio descriptions for a video
- * @param videoFilePath - Path to the input video file
- * @param options - Optional configuration overrides
- * @returns Cost estimation breakdown
- */
-export async function estimateCost(
-  videoFilePath: string, 
-  options: Partial<Config> = {}
-): Promise<CostBreakdown> {
-  // Merge provided options with defaults
-  const settings = { ...options } as Config;
-
-  // Get video duration
-  const videoDuration = getVideoDuration(videoFilePath);
-  console.log(`Video duration: ${videoDuration} seconds`);
-
-  // Calculate the number of frames or batches to process
-  let totalUnits: number;
-  let unitCostMultiplier: number;
-  let unitType: string;
-  
-  if (settings.batchTimeMode) {
-    totalUnits = Math.floor(videoDuration / settings.batchWindowDuration);
-    unitCostMultiplier = settings.framesInBatch; // Cost multiplier for batch mode
-    unitType = "batches";
-  } else {
-    totalUnits = Math.floor(videoDuration / settings.captureIntervalSeconds);
-    unitCostMultiplier = 1; // No multiplier for normal mode
-    unitType = "frames";
-  }
-  
-  console.log(`Will process ${totalUnits} ${unitType}`);
-
-  // Pricing constants (as of March 2025, update as needed)
-  const pricing: {
-    vision: Record<string, Record<string, { input: number; output: number }>>;
-    tts: Record<string, Record<string, number>>;
-  } = {
-    vision: {
-      openai: {
-        'gpt-4o': {
-          input: 0.0025,
-          output: 0.01
-        }
-      },
-      gemini: {
-        'gemini-pro-vision': {
-          input: 0.0025,
-          output: 0.0025
-        }
-      }
-    },
-    tts: {
-      openai: {
-        'tts-1': 0.015,
-        'tts-1-hd': 0.030
-      }
-    }
-  };
-
-  // Get the pricing for the selected providers
-  const visionProvider = settings.visionProvider;
-  const visionModel = settings.visionProviders[visionProvider].model;
-  const ttsProvider = settings.ttsProvider;
-  const ttsModel = settings.ttsProviders[ttsProvider].model;
-
-  // Check if the pricing data exists
-  const visionPricing = pricing.vision[visionProvider]?.[visionModel];
-  const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel];
-
-  if (!visionPricing) {
-    console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`);
-  }
-
-  if (!ttsPricing) {
-    console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`);
-  }
-
-  // Estimated token counts
-  const estimatedVisionInputTokens = 1000 * unitCostMultiplier; // Base tokens for the vision input
-  const estimatedPromptTokens = 100; // Tokens for the prompt text
-  const estimatedOutputTokensPerUnit = 75; // Average tokens for description output
-
-  // Estimated character counts for TTS
-  const estimatedCharsPerDescription = 200; // Average characters per description
-
-  // Calculate estimated costs for first unit
-  const firstUnitCost = {
-    visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
-    visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
-    tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000
-  };
-
-  // For subsequent units, we need context (e.g., previous frames)
-  const contextMultiplier = settings.batchTimeMode ? 1.2 : 2; // Less overhead in batch mode
-  
-  const subsequentUnitCost = {
-    visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
-    visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
-    tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000
-  };
-
-  // Calculate total costs
-  const totalVisionInputCost =
-    firstUnitCost.visionInput +
-    (totalUnits - 1) * subsequentUnitCost.visionInput;
-
-  const totalVisionOutputCost =
-    firstUnitCost.visionOutput +
-    (totalUnits - 1) * subsequentUnitCost.visionOutput;
-
-  const totalTTSCost =
-    firstUnitCost.tts +
-    (totalUnits - 1) * subsequentUnitCost.tts;
-
-  const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost;
-
-  // Create cost breakdown
-  const costBreakdown: CostBreakdown = {
-    videoInfo: {
-      duration: videoDuration,
-      totalUnits: totalUnits,
-      unitType: unitType,
-      processingInterval: settings.batchTimeMode ? settings.batchWindowDuration : settings.captureIntervalSeconds
-    },
-    providerInfo: {
-      visionProvider: visionProvider,
-      visionModel: visionModel,
-      ttsProvider: ttsProvider,
-      ttsModel: ttsModel
-    },
-    apiCosts: {
-      visionInput: totalVisionInputCost.toFixed(4),
-      visionOutput: totalVisionOutputCost.toFixed(4),
-      tts: totalTTSCost.toFixed(4),
-      total: totalCost.toFixed(4)
-    },
-    estimates: {
-      totalAPICallsToProviders: totalUnits * 2, // Vision + TTS for each unit
-      estimatedProcessingTimeMinutes: (totalUnits * 3) / 60 // rough estimate, 3 seconds per unit
-    }
-  };
-
-  return costBreakdown;
-}
+import { Config } from '../config/config';
+import { CostBreakdown } from '../interfaces';
+import { getVideoDuration } from './mediaUtils';
+
+type TTSPricingModel = number | { inputTokens: number; outputTokens: number };
+
+/**
+ * Estimate the API cost of generating audio descriptions for a video.
+ * @param videoFilePath - Path to the input video file
+ * @param options - Settings to estimate with; used as-is, no defaults are merged in
+ * @returns Cost breakdown; every cost is 0 when the video yields no frames or batches
+ */
+export async function estimateCost(
+  videoFilePath: string,
+  options: Partial<Config> = {}
+): Promise<CostBreakdown> {
+  // Treat the provided options as the complete settings object
+  const settings = { ...options } as Config;
+
+  // Get video duration
+  const videoDuration = getVideoDuration(videoFilePath);
+  console.log(`Video duration: ${videoDuration} seconds`);
+
+  // Calculate the number of frames or batches to process
+  let totalUnits: number;
+  let unitCostMultiplier: number;
+  let unitType: string;
+
+  if (settings.batchTimeMode) {
+    totalUnits = Math.floor(videoDuration / settings.batchWindowDuration);
+    unitCostMultiplier = settings.framesInBatch;
+    unitType = "batches";
+  } else {
+    totalUnits = Math.floor(videoDuration / settings.captureIntervalSeconds);
+    unitCostMultiplier = 1;
+    unitType = "frames";
+  }
+
+  console.log(`Will process ${totalUnits} ${unitType}`);
+
+  // Pricing constants per 1K tokens/characters; object-valued TTS entries are per 1M tokens
+  const pricing: {
+    vision: Record<string, Record<string, { input: number; output: number }>>;
+    tts: Record<string, Record<string, TTSPricingModel>>;
+  } = {
+    vision: {
+      openai: {
+        'gpt-4o': { input: 0.0025, output: 0.01 },
+        'gpt-5.4-mini': { input: 0.00015, output: 0.0006 },
+        'gpt-4o-mini': { input: 0.00015, output: 0.0006 }
+      },
+      gemini: {
+        'gemini-2.0-flash': { input: 0.0001, output: 0.0004 },
+        'gemini-1.5-flash': { input: 0.000075, output: 0.0003 },
+        'gemini-1.5-pro': { input: 0.00125, output: 0.005 }
+      },
+      openrouter: {
+        'anthropic/claude-sonnet-4.5': { input: 0.003, output: 0.015 },
+        'anthropic/claude-3.5-sonnet': { input: 0.003, output: 0.015 },
+        'anthropic/claude-3-haiku': { input: 0.0008, output: 0.004 },
+        'google/gemini-2.0-flash-001': { input: 0.0001, output: 0.0004 }
+      }
+    },
+    tts: {
+      openai: {
+        'tts-1': 0.015,
+        'tts-1-hd': 0.030,
+        'gpt-4o-mini-tts': { inputTokens: 0.60, outputTokens: 12.00 }
+      },
+      elevenlabs: {
+        'eleven_multilingual_v2': 0.30,
+        'eleven_turbo_v2.5': 0.015
+      },
+      google: {
+        'chirp-hd': 0.016,
+        'wavenet': 0.016,
+        'neural2': 0.016,
+        'standard': 0.004
+      }
+    }
+  };
+
+  // Get the pricing for the selected providers
+  const visionProvider = settings.visionProvider;
+  const visionModel = settings.visionProviders[visionProvider].model;
+  const ttsProvider = settings.ttsProvider;
+  const ttsModel = settings.ttsProviders[ttsProvider].model;
+
+  // Check if the pricing data exists
+  const visionPricing = pricing.vision[visionProvider]?.[visionModel];
+  const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel];
+
+  if (!visionPricing) {
+    console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`);
+  }
+
+  if (!ttsPricing) {
+    console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`);
+  }
+
+  // Estimated token counts
+  const estimatedVisionInputTokens = 1000 * unitCostMultiplier;
+  const estimatedPromptTokens = 100;
+  const estimatedOutputTokensPerUnit = 75;
+
+  // Estimated character counts for TTS
+  const estimatedCharsPerDescription = 200;
+
+  // Calculate estimated costs for first unit
+  const firstUnitCost = {
+    visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
+    visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
+    tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
+  };
+
+  // For subsequent units, we need context (e.g., previous frames)
+  const contextMultiplier = settings.batchTimeMode ? 1.2 : 2;
+
+  const subsequentUnitCost = {
+    visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
+    visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
+    tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
+  };
+
+  // Calculate total costs. A video shorter than one interval yields zero
+  // units and must cost nothing; the unguarded "(totalUnits - 1)" factor
+  // used to turn that case into a negative estimate.
+  const firstUnits = totalUnits > 0 ? 1 : 0;
+  const laterUnits = Math.max(totalUnits - 1, 0);
+
+  const totalVisionInputCost =
+    firstUnits * firstUnitCost.visionInput + laterUnits * subsequentUnitCost.visionInput;
+  const totalVisionOutputCost =
+    firstUnits * firstUnitCost.visionOutput + laterUnits * subsequentUnitCost.visionOutput;
+  const totalTTSCost =
+    firstUnits * firstUnitCost.tts + laterUnits * subsequentUnitCost.tts;
+
+  const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost;
+
+  // Create cost breakdown
+  const costBreakdown: CostBreakdown = {
+    videoInfo: {
+      duration: videoDuration,
+      totalUnits: totalUnits,
+      unitType: unitType,
+      processingInterval: settings.batchTimeMode ? settings.batchWindowDuration : settings.captureIntervalSeconds
+    },
+    providerInfo: {
+      visionProvider: visionProvider,
+      visionModel: visionModel,
+      ttsProvider: ttsProvider,
+      ttsModel: ttsModel
+    },
+    apiCosts: {
+      visionInput: totalVisionInputCost.toFixed(4),
+      visionOutput: totalVisionOutputCost.toFixed(4),
+      tts: totalTTSCost.toFixed(4),
+      total: totalCost.toFixed(4)
+    },
+    estimates: {
+      totalAPICallsToProviders: totalUnits * 2,
+      estimatedProcessingTimeMinutes: (totalUnits * 3) / 60
+    }
+  };
+
+  return costBreakdown;
+}
+
+/** Estimate the TTS cost of `charCount` characters; flat rates are per 1K characters, token rates per 1M tokens. */
+function calculateTTSCost(charCount: number, pricing: TTSPricingModel | undefined): number {
+  // Missing (or zero) pricing contributes nothing to the estimate.
+  if (!pricing) {
+    return 0;
+  }
+  if (typeof pricing !== 'number') {
+    // Heuristic token counts: 0.25 input tokens and 3 output (audio) tokens per character.
+    const textTokens = charCount * 0.25;
+    const audioTokens = charCount * 3;
+    return (textTokens * pricing.inputTokens + audioTokens * pricing.outputTokens) / 1000000;
+  }
+  return charCount * pricing / 1000;
+}
--- a/src/utils/processor.ts
+++ b/src/utils/processor.ts
@@ -173,7 +173,8 @@ export async function generateAudioDescription(
    const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
      voice: settings.ttsVoice,
      model: settings.ttsProviders[settings.ttsProvider].model,
-      speedFactor: settings.ttsSpeedFactor
+      speedFactor: settings.ttsSpeedFactor,
+      instructions: settings.ttsInstructions
    });

    const audioDuration = ttsResult.duration;
@@ -304,7 +305,8 @@ async function generateAudioDescriptionBatch(
    const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
      voice: settings.ttsVoice,
      model: settings.ttsProviders[settings.ttsProvider].model,
-      speedFactor: settings.ttsSpeedFactor
+      speedFactor: settings.ttsSpeedFactor,
+      instructions: settings.ttsInstructions
    });

    const audioDuration = ttsResult.duration;
Author	SHA1	Message	Date
Talon	3a198d7d50	Update cost estimator with all new models and per-token TTS pricing support	2026-05-13 02:44:30 +02:00
Talon	f05e57493c	Add Google Cloud Chirp 3 TTS provider with service account support	2026-05-13 02:42:54 +02:00
Talon	6e9a26557f	Add OpenRouter vision provider for multi-model access via unified API	2026-05-13 02:40:03 +02:00
Talon	eb15af3a36	Add ElevenLabs TTS provider with segment continuity support	2026-05-13 02:38:16 +02:00
Talon	19975917c5	Enhance OpenAI TTS: add gpt-4o-mini-tts support with instructions parameter	2026-05-13 02:36:46 +02:00