From 78730c2ce9f75bb3a576d1ea3808609106cae8b6 Mon Sep 17 00:00:00 2001 From: Talon Date: Wed, 12 Mar 2025 14:43:43 +0100 Subject: [PATCH] Abstract tts and vision providers --- index.js | 1433 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 850 insertions(+), 583 deletions(-) diff --git a/index.js b/index.js index 8b6641f..7fb00af 100644 --- a/index.js +++ b/index.js @@ -11,26 +11,45 @@ const { hideBin } = require('yargs/helpers'); // Load environment variables dotenv.config(); -// Initialize OpenAI client -const openai = new OpenAI({ - apiKey: process.env.OPENAI_API_KEY, -}); - // Default configuration options const defaultConfig = { captureIntervalSeconds: 10, contextWindowSize: 5, defaultPrompt: "Describe this frame from a video in 1-2 sentences for someone who cannot see it. Focus on key visual elements. Avoid using terms like 'in this frame', simply describe the actual frame. Keep sentences short and concise, as this will be used to generate an audio track which is overlayed on the video.", changePrompt: "Describe what has changed between these frames in 1-2 sentences for someone who cannot see the video. Focus on significant visual changes only. Avoid talking about meta information such as 'in this frame', or 'the significant change is', and merely describe the actual change taking place. Only describe the changes relevant to the last frame. The previous frames are attached for you to build context and build situational awareness. Keep it short and concise, as your text will be used to generate audio description tracks to be played with the video.", - ttsModel: "tts-1-hd", // OpenAI's TTS model + batchPrompt: "Describe the sequence of frames in this batch over time for someone who cannot see it. Focus on what happens, changes, or stands out visually during these seconds. Keep it to 1-3 concise sentences, avoiding words like 'in these frames'—just describe what's happening. Use context from the previous batch if relevant. Keep sentences short and concise. Avoid speculation or overly verbose or unnecessary sentences. Try not to use nested sentences and keep sentences short to help flow. This will be used for audio description and mixed back in with the video file later, so we need to maintain consistency and quick pacing. Avoid using phrases such as 'as evidenced by' or 'suggesting'. Only focus on describing the visual scene. Do not repeat information given in the previous prompt, and focus only on what has changed since that description. Avoid talking about the scene or sequence, simply focus on the action within these frames. The listener knows that this is a video, so we do not need to remind them. 
Also avoid overusing phrases such as 'the scene shifts', the shifting or perspective change should be evident from the description of the sequence itself.", + + // Vision AI settings + visionProvider: "openai", + visionModel: "gpt-4o", + visionProviders: { + openai: { + apiKey: process.env.OPENAI_API_KEY, + model: "gpt-4o", + maxTokens: 300 + }, + // Add other vision providers here + }, + + // TTS settings + ttsProvider: "openai", ttsVoice: "alloy", // Voice option for TTS ttsSpeedFactor: 1.5, // Speed up audio by 50% + ttsProviders: { + openai: { + apiKey: process.env.OPENAI_API_KEY, + model: "tts-1-hd", + voice: "alloy" + }, + // Add other TTS providers here + }, + + // Video processing settings outputDir: "/mnt/e/desc/output/", tempDir: "/mnt/e/desc/temp/", batchTimeMode: true, // Whether to use the new batch time mode batchWindowDuration: 15, // How many seconds each batch covers framesInBatch: 10, // How many frames to capture within each batch - batchPrompt: "Describe the sequence of frames in this batch over time for someone who cannot see it. Focus on what happens, changes, or stands out visually during these seconds. Keep it to 1-3 concise sentences, avoiding words like 'in these frames'—just describe what's happening. Use context from the previous batch if relevant. Keep sentences short and concise. Avoid speculation or overly verbose or unnecessary sentences. Try not to use nested sentences and keep sentences short to help flow. This will be used for audio description and mixed back in with the video file later, so we need to maintain consistency and quick pacing. Avoid using phrases such as 'as evidenced by' or 'suggesting'. Only focus on describing the visual scene. Do not repeat information given in the previous prompt, and focus only on what has changed since that description." 
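    // Illustrative only, not part of this patch: a second provider entry would follow the
    // same shape as the "openai" blocks above, for example (hypothetical names):
    //   visionProviders: { ..., someVisionApi: { apiKey: process.env.SOME_VISION_API_KEY, model: "some-vision-model", maxTokens: 300 } }
    //   ttsProviders:    { ..., someTtsApi:    { apiKey: process.env.SOME_TTS_API_KEY, model: "some-tts-model", voice: "default" } }
    // A matching `case` would also have to be added to VisionProviderFactory / TTSProviderFactory
    // below before such a provider could be selected via visionProvider / ttsProvider.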
}; let stats = { @@ -42,6 +61,301 @@ let stats = { totalCost: 0 }; +/** + * Factory for creating vision AI providers + */ +class VisionProviderFactory { + static getProvider(config) { + const providerName = config.visionProvider; + const providerConfig = config.visionProviders[providerName]; + + if (!providerConfig) { + throw new Error(`Vision provider "${providerName}" not configured.`); + } + + switch (providerName) { + case 'openai': + return new OpenAIVisionProvider(providerConfig); + // Add other providers here + default: + throw new Error(`Vision provider "${providerName}" not implemented.`); + } + } +} + +/** + * Factory for creating TTS providers + */ +class TTSProviderFactory { + static getProvider(config) { + const providerName = config.ttsProvider; + const providerConfig = config.ttsProviders[providerName]; + + if (!providerConfig) { + throw new Error(`TTS provider "${providerName}" not configured.`); + } + + switch (providerName) { + case 'openai': + return new OpenAITTSProvider(providerConfig); + // Add other providers here + default: + throw new Error(`TTS provider "${providerName}" not implemented.`); + } + } +} + +/** + * OpenAI Vision Provider Implementation + */ +class OpenAIVisionProvider { + constructor(config) { + this.config = config; + this.openai = new OpenAI({ + apiKey: config.apiKey, + }); + } + + /** + * Describe a single image + * @param {string} imagePath - Path to the image file + * @param {string} prompt - Prompt for the AI + * @returns {Promise<{description: string, usage: object}>} Description and usage stats + */ + async describeImage(imagePath, prompt) { + try { + const imageData = fs.readFileSync(imagePath); + const base64Image = imageData.toString('base64'); + + const response = await this.openai.chat.completions.create({ + model: this.config.model, + messages: [ + { + role: "user", + content: [ + { type: "text", text: prompt }, + { + type: "image_url", + image_url: { + url: `data:image/jpeg;base64,${base64Image}` + } + } + ] + } + ], + max_tokens: this.config.maxTokens || 300 + }); + + return { + description: response.choices[0].message.content.trim(), + usage: { + inputTokens: response.usage.prompt_tokens, + outputTokens: response.usage.completion_tokens, + totalTokens: response.usage.total_tokens + } + }; + } catch (error) { + console.error("Error describing image:", error); + return { + description: "Unable to describe this image.", + usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 } + }; + } + } + + /** + * Compare two images and describe the differences + * @param {string} image1Path - Path to the first image + * @param {string} image2Path - Path to the second image + * @param {string} prompt - Prompt for the AI + * @returns {Promise<{description: string, usage: object}>} Description and usage stats + */ + async compareImages(image1Path, image2Path, prompt) { + try { + const image1Data = fs.readFileSync(image1Path); + const image2Data = fs.readFileSync(image2Path); + + const base64Image1 = image1Data.toString('base64'); + const base64Image2 = image2Data.toString('base64'); + + const response = await this.openai.chat.completions.create({ + model: this.config.model, + messages: [ + { + role: "user", + content: [ + { type: "text", text: prompt }, + { + type: "image_url", + image_url: { + url: `data:image/jpeg;base64,${base64Image1}` + } + }, + { + type: "image_url", + image_url: { + url: `data:image/jpeg;base64,${base64Image2}` + } + } + ] + } + ], + max_tokens: this.config.maxTokens || 300 + }); + + return { + description: 
response.choices[0].message.content.trim(), + usage: { + inputTokens: response.usage.prompt_tokens, + outputTokens: response.usage.completion_tokens, + totalTokens: response.usage.total_tokens + } + }; + } catch (error) { + console.error("Error comparing images:", error); + return { + description: "Unable to describe the differences between these images.", + usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 } + }; + } + } + + /** + * Describe a batch of images + * @param {string[]} imagePaths - Array of paths to the images + * @param {object} lastBatchContext - Context from the previous batch + * @param {string} prompt - Prompt for the AI + * @returns {Promise<{description: string, usage: object}>} Description and usage stats + */ + async describeBatch(imagePaths, lastBatchContext, prompt) { + try { + // Convert images to base64 + const imagesBase64 = imagePaths.map(fp => { + const imageData = fs.readFileSync(fp); + return imageData.toString('base64'); + }); + + // Build the messages array for the chat completion + const messages = [ + { + role: "user", + content: [ + { type: "text", text: prompt } + ] + } + ]; + + // If we have some text context from the last batch, inject that as well + if (lastBatchContext && lastBatchContext.lastDescription) { + messages.unshift({ + role: "system", + content: `Previous batch summary: ${lastBatchContext.lastDescription}` + }); + } + + // Append each image in the new batch + imagesBase64.forEach(base64 => { + messages[messages.length - 1].content.push({ + type: "image_url", + image_url: { + url: `data:image/jpeg;base64,${base64}` + } + }); + }); + + const response = await this.openai.chat.completions.create({ + model: this.config.model, + messages, + max_tokens: this.config.maxTokens || 300 + }); + + return { + description: response.choices[0].message.content.trim(), + usage: { + inputTokens: response.usage.prompt_tokens, + outputTokens: response.usage.completion_tokens, + totalTokens: response.usage.total_tokens + } + }; + } catch (error) { + console.error("Error describing batch of images:", error); + return { + description: "Unable to describe this batch of images.", + usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 } + }; + } + } +} + +/** + * OpenAI TTS Provider Implementation + */ +class OpenAITTSProvider { + constructor(config) { + this.config = config; + this.openai = new OpenAI({ + apiKey: config.apiKey, + }); + } + + /** + * Convert text to speech + * @param {string} text - Text to convert to speech + * @param {string} outputPath - Output path for the audio file + * @param {object} options - Additional options + * @returns {Promise<{duration: number, cost: number}>} Duration of the generated audio in seconds and cost + */ + async textToSpeech(text, outputPath, options = {}) { + try { + // Get the options, with defaults from config + const voice = options.voice || this.config.voice; + const model = options.model || this.config.model; + const speedFactor = options.speedFactor || 1.0; + + // Generate the initial TTS output + const tempOutputPath = outputPath.replace(/\.\w+$/, '_temp$&'); + + const mp3 = await this.openai.audio.speech.create({ + model: model, + voice: voice, + input: text + }); + + // Cost calculation is based on character count + const cost = text.length; + + const buffer = Buffer.from(await mp3.arrayBuffer()); + fs.writeFileSync(tempOutputPath, buffer); + + // Speed up the audio using FFmpeg if needed + if (speedFactor !== 1.0) { + execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" 
-c:a libmp3lame -q:a 2 "${outputPath}" -y`); + // Clean up temporary file + fs.unlinkSync(tempOutputPath); + } else { + // Just use the file as is + fs.renameSync(tempOutputPath, outputPath); + } + + // Get actual audio duration for accurate timing + const audioDuration = getAudioDuration(outputPath); + + return { + duration: audioDuration, + cost: cost + }; + } catch (error) { + console.error("Error generating speech:", error); + // Create a silent audio file if TTS fails + execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`); + return { + duration: 1, + cost: 0 + }; + } + } +} + /** * Parse command line arguments */ @@ -62,16 +376,28 @@ function parseCommandLineArgs() { describe: 'Number of frames to keep in context', type: 'number' }) + // Vision provider options + .option('visionProvider', { + describe: 'Provider to use for vision AI', + type: 'string' + }) + .option('visionModel', { + describe: 'Model to use for vision AI', + type: 'string' + }) + // TTS provider options + .option('ttsProvider', { + describe: 'Provider to use for text-to-speech', + type: 'string' + }) .option('ttsModel', { alias: 'm', - describe: 'OpenAI TTS model to use', - choices: ['tts-1', 'tts-1-hd'], + describe: 'TTS model to use', type: 'string' }) .option('ttsVoice', { alias: 'v', describe: 'Voice to use for text-to-speech', - choices: ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'], type: 'string' }) .option('ttsSpeedFactor', { @@ -132,14 +458,13 @@ function parseCommandLineArgs() { .help() .alias('help', 'h') .example('$0 video.mp4', 'Process a video with default settings') - .example('$0 video.mp4 --captureIntervalSeconds 10 --ttsVoice nova', 'Process with custom interval and voice') + .example('$0 video.mp4 --ttsVoice nova --visionProvider openai', 'Process with custom voice and vision provider') .example('$0 video.mp4 --estimate', 'Only estimate the processing cost') .example('$0 video.mp4 --config myconfig.json', 'Use settings from a config file') .example('$0 video.mp4 --saveConfig myconfig.json', 'Save current settings to a config file') .argv; } - /** * Get the duration of a video file in seconds * @param {string} videoFilePath - Path to the video file @@ -170,451 +495,6 @@ function captureVideoFrame(videoFilePath, timePosition, outputPath, lowQuality = execSync(command); } -/** - * Describe a single video frame using AI - * @param {string} framePath - Path to the frame image - * @param {string} prompt - Prompt for the AI - * @returns {string} Description of the frame - */ -async function describeFrame(framePath, prompt) { - try { - const imageData = fs.readFileSync(framePath); - const base64Image = imageData.toString('base64'); - - const response = await openai.chat.completions.create({ - model: "gpt-4o", - messages: [ - { - role: "user", - content: [ - { type: "text", text: prompt }, - { - type: "image_url", - image_url: { - url: `data:image/jpeg;base64,${base64Image}` - } - } - ] - } - ], - max_tokens: 300 - }); - stats.totalVisionInputCost += response.usage.prompt_tokens; - stats.totalVisionOutputCost += response.usage.completion_tokens; - stats.totalCost += response.usage.total_tokens; - - - return response.choices[0].message.content.trim(); - } catch (error) { - console.error("Error describing frame:", error); - return "Unable to describe this frame."; - } -} - -/** - * Describe changes between two frames using AI - * @param {string} previousFramePath - Path to the previous frame - * @param {string} currentFramePath - Path to 
the current frame - * @param {string} prompt - Prompt for the AI - * @returns {string} Description of changes between frames - */ -async function describeFrameChange(previousFramePath, currentFramePath, prompt) { - try { - const previousImageData = fs.readFileSync(previousFramePath); - const currentImageData = fs.readFileSync(currentFramePath); - - const previousBase64 = previousImageData.toString('base64'); - const currentBase64 = currentImageData.toString('base64'); - - const response = await openai.chat.completions.create({ - model: "gpt-4o", - messages: [ - { - role: "user", - content: [ - { type: "text", text: prompt }, - { - type: "image_url", - image_url: { - url: `data:image/jpeg;base64,${previousBase64}` - } - }, - { - type: "image_url", - image_url: { - url: `data:image/jpeg;base64,${currentBase64}` - } - } - ] - } - ], - max_tokens: 300 - }); - stats.totalVisionInputCost += response.usage.prompt_tokens; - stats.totalVisionOutputCost += response.usage.completion_tokens; - stats.totalCost += response.usage.total_tokens; - - return response.choices[0].message.content.trim(); - } catch (error) { - console.error("Error describing frame change:", error); - return "Unable to describe changes between frames."; - } -} - -/** - * Describe a batch of frames using AI, optionally providing context (last batch's data). - * @param {string[]} framePaths - Array of file paths for this batch's frames - * @param {object} lastBatchContext - Data from the previous batch (e.g., text or images) - * @param {string} batchPrompt - Prompt text for describing a batch - * @returns {string} AI-generated batch description - */ -async function describeBatchOfFrames(framePaths, lastBatchContext, batchPrompt) { - try { - // Convert images to base64 - const imagesBase64 = framePaths.map(fp => { - const imageData = fs.readFileSync(fp); - return imageData.toString('base64'); - }); - - // Build the messages array for the chat completion - // Start with a system or user message that includes the prompt - const messages = [ - { - role: "user", - content: [ - { type: "text", text: batchPrompt } - ] - } - ]; - - // If we have some text context from the last batch, inject that as well - if (lastBatchContext && lastBatchContext.lastDescription) { - // E.g., add it as a "system" or "user" message depending on your style - messages.unshift({ - role: "system", - content: `Previous batch summary: ${lastBatchContext.lastDescription}` - }); - } - - // Append each image in the new batch - imagesBase64.forEach(base64 => { - messages[messages.length - 1].content.push({ - type: "image_url", - image_url: { - url: `data:image/jpeg;base64,${base64}` - } - }); - }); - - const response = await openai.chat.completions.create({ - model: "gpt-4o", - messages, - max_tokens: 300 - }); - stats.totalVisionInputCost += response.usage.prompt_tokens; - stats.totalVisionOutputCost += response.usage.completion_tokens; - stats.totalCost += response.usage.total_tokens; - - return response.choices[0].message.content.trim(); - } catch (error) { - console.error("Error describing batch of frames:", error); - return "Unable to describe this batch of frames."; - } -} - -// Modified function to prevent audio overlap -async function generateAudioDescription(videoFilePath, options = {}) { - // Merge provided options with defaults - const settings = { ...defaultConfig, ...options }; - - // Ensure temporary and output directories exist - if (!fs.existsSync(settings.tempDir)) { - fs.mkdirSync(settings.tempDir, { recursive: true }); - } - if 
(!fs.existsSync(settings.outputDir)) { - fs.mkdirSync(settings.outputDir, { recursive: true }); - } - - // Get video duration - const videoDuration = getVideoDuration(videoFilePath); - stats.totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds); - console.log(`Video duration: ${videoDuration} seconds`); - - // If batchTimeMode is enabled, use the new approach - if (settings.batchTimeMode) { - return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings); - } - - // Calculate the number of frames to capture - const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds); - console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`); - - // Context window to store previous frames - const frameContext = []; - - // Array to store audio segment information - const audioSegments = []; - - // Track our current time position (will be adjusted for audio overlap) - let currentTimePosition = 0; - - // Track drift from the original schedule - let timelineDrift = 0; - const maxAllowableDrift = settings.captureIntervalSeconds * 2; // Maximum drift before warning - - // Process each frame - for (let i = 0; i < totalFrames; i++) { - // Calculate the ideal time position based on the original schedule - const idealTimePosition = i * settings.captureIntervalSeconds; - - // Use the adjusted time position that accounts for previous audio durations - const timePosition = currentTimePosition; - - // Calculate drift from the original schedule - timelineDrift = timePosition - idealTimePosition; - - // Log if drift is becoming significant - if (Math.abs(timelineDrift) > maxAllowableDrift) { - console.warn(`WARNING: Timeline drift at frame ${i} is ${timelineDrift.toFixed(2)} seconds.`); - } - - const frameFilePath = path.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`); - - // Capture frame at current time position (use the ideal time to capture the frame) - captureVideoFrame(videoFilePath, idealTimePosition, frameFilePath); - console.log(`Captured frame at ${idealTimePosition} seconds (scheduled at ${timePosition.toFixed(2)} seconds)`); - - // Add current frame to context - const currentFrame = { - index: i, - path: frameFilePath, - timePosition - }; - - frameContext.push(currentFrame); - - // Keep context window at specified size - if (frameContext.length > settings.contextWindowSize) { - frameContext.shift(); - } - - // Generate description - let description; - if (frameContext.length === 1) { - // First frame - just describe what's in it - description = await describeFrame(frameFilePath, settings.defaultPrompt); - } else { - // Compare with previous frame - const previousFrame = frameContext[frameContext.length - 2]; - description = await describeFrameChange(previousFrame.path, frameFilePath, settings.changePrompt); - } - - console.log(`Description: ${description}`); - - // Generate speech from description - const audioFilePath = path.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`); - const audioDuration = await textToSpeech(description, audioFilePath, settings.ttsModel, settings.ttsVoice, settings.ttsSpeedFactor || 2); - - console.log(`Audio duration: ${audioDuration} seconds`); - - // Store segment information - audioSegments.push({ - audioFile: audioFilePath, - startTime: timePosition, - duration: audioDuration, - description - }); - - // Update the time position for the next iteration - // Add a small buffer (0.25 sec) between descriptions to prevent hard cuts - 
const bufferTime = 0.25; - currentTimePosition = timePosition + audioDuration + bufferTime; - - // If we've fallen behind schedule, try to catch up (but don't skip content) - const nextIdealPosition = (i + 1) * settings.captureIntervalSeconds; - if (currentTimePosition < nextIdealPosition) { - console.log(`Audio finished before next scheduled frame. Catching up with timeline.`); - currentTimePosition = nextIdealPosition; - timelineDrift = 0; // Reset drift since we've caught up - } - } - - // Combine audio segments into final audio description track - const outputAudioPath = path.join(settings.outputDir, `${path.basename(videoFilePath, path.extname(videoFilePath))}_description.mp3`); - combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings); - - // Clean up temporary files if desired - // cleanupTempFiles(settings.tempDir); - - console.log(`\nAudio description generated: ${outputAudioPath}`); - console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`); - printStats(stats, settings); - - return { - videoFile: videoFilePath, - audioDescriptionFile: outputAudioPath - }; -} - -/** - * Generate audio description using the new "batch time" mode with overlap prevention. - * @param {string} videoFilePath - Path to the input video file - * @param {number} videoDuration - Duration of the video in seconds - * @param {object} settings - The merged config and user options - */ -async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings) { - const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration); - console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`); - - // We'll hold the last batch's frames or last batch's description for context - let lastBatchContext = []; - - const audioSegments = []; - - // Track our current time position (will be adjusted for audio overlap) - let currentTimePosition = 0; - - // Track drift from the original schedule - let timelineDrift = 0; - const maxAllowableDrift = settings.batchWindowDuration * 0.5; // Maximum drift of 50% of batch window - - for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) { - // Calculate ideal batch timing based on configuration - const idealBatchStart = batchIndex * settings.batchWindowDuration; - - // Use adjusted time position that accounts for previous audio durations - const batchStart = currentTimePosition; - - // Calculate drift from the original schedule - timelineDrift = batchStart - idealBatchStart; - - // Log if drift is becoming significant - if (Math.abs(timelineDrift) > maxAllowableDrift) { - console.warn(`WARNING: Timeline drift at batch ${batchIndex} is ${timelineDrift.toFixed(2)} seconds.`); - } - - const batchEnd = idealBatchStart + settings.batchWindowDuration; - if (batchEnd > videoDuration) break; // Safety check - - console.log(`Processing batch #${batchIndex}: Original time window ${idealBatchStart}-${batchEnd} sec, scheduled at ${batchStart.toFixed(2)} sec`); - - // Capture frames for this batch - use the ideal timing for frame capture - const framePaths = []; - for (let i = 0; i < settings.framesInBatch; i++) { - const t = idealBatchStart + (i * settings.batchWindowDuration) / settings.framesInBatch; - const frameFilePath = path.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`); - captureVideoFrame(videoFilePath, t, frameFilePath); - framePaths.push(frameFilePath); - } - - // Use AI to describe this batch of frames, possibly 
providing some context - let description = await describeBatchOfFrames( - framePaths, - lastBatchContext, - settings.batchPrompt - ); - - console.log(`Batch #${batchIndex} description:\n${description}\n`); - - // Convert description to TTS - const audioFilePath = path.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`); - const audioDuration = await textToSpeech( - description, - audioFilePath, - settings.ttsModel, - settings.ttsVoice, - settings.ttsSpeedFactor - ); - - console.log(`Batch #${batchIndex} audio duration: ${audioDuration} seconds`); - - // Store segment info with the adjusted start time - audioSegments.push({ - audioFile: audioFilePath, - startTime: batchStart, - duration: audioDuration, - description - }); - - // Update the time position for the next iteration - // Add a small buffer (0.5 sec) between descriptions - const bufferTime = 0.5; - currentTimePosition = batchStart + audioDuration + bufferTime; - - // If we've fallen behind schedule, try to catch up (but don't skip content) - const nextIdealPosition = (batchIndex + 1) * settings.batchWindowDuration; - if (currentTimePosition < nextIdealPosition) { - console.log(`Batch audio finished before next scheduled batch. Catching up with timeline.`); - currentTimePosition = nextIdealPosition; - timelineDrift = 0; // Reset drift since we've caught up - } - - // Update lastBatchContext so the next batch can keep track of what's previously seen - lastBatchContext = { - lastDescription: description, - lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch - }; - } - - // Combine all the audio segments into one track - const outputAudioPath = path.join( - settings.outputDir, - `${path.basename(videoFilePath, path.extname(videoFilePath))}_description_batch.mp3` - ); - combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings); - - console.log(`\nBatch audio description generated: ${outputAudioPath}`); - console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`); - printStats(stats, settings); - - return { - videoFile: videoFilePath, - audioDescriptionFile: outputAudioPath - }; -} - -/** - * Convert text to speech using AI with speed adjustment, and return the actual duration - * @param {string} text - Text to convert to speech - * @param {string} outputPath - Output path for the audio file - * @param {string} model - TTS model to use - * @param {string} voice - Voice to use for TTS - * @param {number} speedFactor - Speed multiplier (1.0 = normal speed, 2.0 = double speed) - * @returns {number} The actual duration of the generated audio in seconds - */ -async function textToSpeech(text, outputPath, model, voice, speedFactor = 1.3) { - try { - // Generate the initial TTS output - const tempOutputPath = outputPath.replace(/\.\w+$/, '_temp$&'); - - const mp3 = await openai.audio.speech.create({ - model: model, - voice: voice, - input: text - }); - - stats.totalTTSCost += text.length; - - const buffer = Buffer.from(await mp3.arrayBuffer()); - fs.writeFileSync(tempOutputPath, buffer); - - // Speed up the audio using FFmpeg - execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`); - - // Clean up temporary file - fs.unlinkSync(tempOutputPath); - - // Get actual audio duration for accurate timing - const audioDuration = getAudioDuration(outputPath); - return audioDuration; - } catch (error) { - console.error("Error generating speech:", error); - // Create a silent audio file 
if TTS fails - execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`); - return 1; // Return a default duration of 1 second - } -} - /** * Get the duration of an audio file in seconds * @param {string} audioFilePath - Path to the audio file @@ -893,12 +773,500 @@ function cleanupTempFiles(tempDir) { } } +/** + * Generate audio description for a video + * @param {string} videoFilePath - Path to the input video file + * @param {object} options - Optional configuration overrides + * @returns {Promise} Result of the operation + */ +async function generateAudioDescription(videoFilePath, options = {}) { + // Merge provided options with defaults + const settings = { ...defaultConfig, ...options }; + + // Initialize providers + const visionProvider = VisionProviderFactory.getProvider(settings); + const ttsProvider = TTSProviderFactory.getProvider(settings); + + // Ensure temporary and output directories exist + if (!fs.existsSync(settings.tempDir)) { + fs.mkdirSync(settings.tempDir, { recursive: true }); + } + if (!fs.existsSync(settings.outputDir)) { + fs.mkdirSync(settings.outputDir, { recursive: true }); + } + + // Get video duration + const videoDuration = getVideoDuration(videoFilePath); + stats.totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds); + console.log(`Video duration: ${videoDuration} seconds`); + + // If batchTimeMode is enabled, use the new approach + if (settings.batchTimeMode) { + return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings, visionProvider, ttsProvider); + } + + // Calculate the number of frames to capture + const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds); + console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`); + + // Context window to store previous frames + const frameContext = []; + + // Array to store audio segment information + const audioSegments = []; + + // Track our current time position (will be adjusted for audio overlap) + let currentTimePosition = 0; + + // Track drift from the original schedule + let timelineDrift = 0; + const maxAllowableDrift = settings.captureIntervalSeconds * 2; // Maximum drift before warning + + // Process each frame + for (let i = 0; i < totalFrames; i++) { + // Calculate the ideal time position based on the original schedule + const idealTimePosition = i * settings.captureIntervalSeconds; + + // Use the adjusted time position that accounts for previous audio durations + const timePosition = currentTimePosition; + + // Calculate drift from the original schedule + timelineDrift = timePosition - idealTimePosition; + + // Log if drift is becoming significant + if (Math.abs(timelineDrift) > maxAllowableDrift) { + console.warn(`WARNING: Timeline drift at frame ${i} is ${timelineDrift.toFixed(2)} seconds.`); + } + + const frameFilePath = path.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`); + + // Capture frame at current time position (use the ideal time to capture the frame) + captureVideoFrame(videoFilePath, idealTimePosition, frameFilePath); + console.log(`Captured frame at ${idealTimePosition} seconds (scheduled at ${timePosition.toFixed(2)} seconds)`); + + // Add current frame to context + const currentFrame = { + index: i, + path: frameFilePath, + timePosition + }; + + frameContext.push(currentFrame); + + // Keep context window at specified size + if (frameContext.length > settings.contextWindowSize) { + 
frameContext.shift(); + } + + // Generate description + let description; + let usageStats; + + if (frameContext.length === 1) { + // First frame - just describe what's in it + const result = await visionProvider.describeImage(frameFilePath, settings.defaultPrompt); + description = result.description; + usageStats = result.usage; + } else { + // Compare with previous frame + const previousFrame = frameContext[frameContext.length - 2]; + const result = await visionProvider.compareImages(previousFrame.path, frameFilePath, settings.changePrompt); + description = result.description; + usageStats = result.usage; + } + + // Update stats + stats.totalVisionInputCost += usageStats.inputTokens; + stats.totalVisionOutputCost += usageStats.outputTokens; + stats.totalCost += usageStats.totalTokens; + + console.log(`Description: ${description}`); + + // Generate speech from description + const audioFilePath = path.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`); + + const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, { + voice: settings.ttsVoice, + model: settings.ttsProviders[settings.ttsProvider].model, + speedFactor: settings.ttsSpeedFactor + }); + + const audioDuration = ttsResult.duration; + stats.totalTTSCost += ttsResult.cost; + + console.log(`Audio duration: ${audioDuration} seconds`); + + // Store segment information + audioSegments.push({ + audioFile: audioFilePath, + startTime: timePosition, + duration: audioDuration, + description + }); + + // Update the time position for the next iteration + // Add a small buffer (0.25 sec) between descriptions to prevent hard cuts + const bufferTime = 0.25; + currentTimePosition = timePosition + audioDuration + bufferTime; + + // If we've fallen behind schedule, try to catch up (but don't skip content) + const nextIdealPosition = (i + 1) * settings.captureIntervalSeconds; + if (currentTimePosition < nextIdealPosition) { + console.log(`Audio finished before next scheduled frame. Catching up with timeline.`); + currentTimePosition = nextIdealPosition; + timelineDrift = 0; // Reset drift since we've caught up + } + } + + // Combine audio segments into final audio description track + const outputAudioPath = path.join(settings.outputDir, `${path.basename(videoFilePath, path.extname(videoFilePath))}_description.mp3`); + combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings); + + // Clean up temporary files if desired + // cleanupTempFiles(settings.tempDir); + + console.log(`\nAudio description generated: ${outputAudioPath}`); + console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`); + printStats(stats, settings); + + return { + videoFile: videoFilePath, + audioDescriptionFile: outputAudioPath + }; +} + +/** + * Generate audio description using the "batch time" mode with overlap prevention. + * @param {string} videoFilePath - Path to the input video file + * @param {number} videoDuration - Duration of the video in seconds + * @param {object} settings - The merged config and user options + * @param {object} visionProvider - The vision provider instance + * @param {object} ttsProvider - The TTS provider instance + */ +async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings, visionProvider, ttsProvider) { + const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration); + console.log(`Using batchTimeMode. 
Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`); + + // We'll hold the last batch's frames or last batch's description for context + let lastBatchContext = []; + + const audioSegments = []; + + // Track our current time position (will be adjusted for audio overlap) + let currentTimePosition = 0; + + // Track drift from the original schedule + let timelineDrift = 0; + const maxAllowableDrift = settings.batchWindowDuration * 0.5; // Maximum drift of 50% of batch window + + for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) { + // Calculate ideal batch timing based on configuration + const idealBatchStart = batchIndex * settings.batchWindowDuration; + + // Use adjusted time position that accounts for previous audio durations + const batchStart = currentTimePosition; + + // Calculate drift from the original schedule + timelineDrift = batchStart - idealBatchStart; + + // Log if drift is becoming significant + if (Math.abs(timelineDrift) > maxAllowableDrift) { + console.warn(`WARNING: Timeline drift at batch ${batchIndex} is ${timelineDrift.toFixed(2)} seconds.`); + } + + const batchEnd = idealBatchStart + settings.batchWindowDuration; + if (batchEnd > videoDuration) break; // Safety check + + console.log(`Processing batch #${batchIndex}: Original time window ${idealBatchStart}-${batchEnd} sec, scheduled at ${batchStart.toFixed(2)} sec`); + + // Capture frames for this batch - use the ideal timing for frame capture + const framePaths = []; + for (let i = 0; i < settings.framesInBatch; i++) { + const t = idealBatchStart + (i * settings.batchWindowDuration) / settings.framesInBatch; + const frameFilePath = path.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`); + captureVideoFrame(videoFilePath, t, frameFilePath); + framePaths.push(frameFilePath); + } + + // Use AI to describe this batch of frames, possibly providing some context + const result = await visionProvider.describeBatch( + framePaths, + lastBatchContext, + settings.batchPrompt + ); + + const description = result.description; + const usageStats = result.usage; + + // Update stats + stats.totalVisionInputCost += usageStats.inputTokens; + stats.totalVisionOutputCost += usageStats.outputTokens; + stats.totalCost += usageStats.totalTokens; + + console.log(`Batch #${batchIndex} description:\n${description}\n`); + + // Convert description to TTS + const audioFilePath = path.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`); + + const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, { + voice: settings.ttsVoice, + model: settings.ttsProviders[settings.ttsProvider].model, + speedFactor: settings.ttsSpeedFactor + }); + + const audioDuration = ttsResult.duration; + stats.totalTTSCost += ttsResult.cost; + + console.log(`Batch #${batchIndex} audio duration: ${audioDuration} seconds`); + + // Store segment info with the adjusted start time + audioSegments.push({ + audioFile: audioFilePath, + startTime: batchStart, + duration: audioDuration, + description + }); + + // Update the time position for the next iteration + // Add a small buffer (0.5 sec) between descriptions + const bufferTime = 0.5; + currentTimePosition = batchStart + audioDuration + bufferTime; + + // If we've fallen behind schedule, try to catch up (but don't skip content) + const nextIdealPosition = (batchIndex + 1) * settings.batchWindowDuration; + if (currentTimePosition < nextIdealPosition) { + console.log(`Batch audio finished before next scheduled batch. 
Catching up with timeline.`); + currentTimePosition = nextIdealPosition; + timelineDrift = 0; // Reset drift since we've caught up + } + + // Update lastBatchContext so the next batch can keep track of what's previously seen + lastBatchContext = { + lastDescription: description, + lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch + }; + } + + // Combine all the audio segments into one track + const outputAudioPath = path.join( + settings.outputDir, + `${path.basename(videoFilePath, path.extname(videoFilePath))}_description_batch.mp3` + ); + combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings); + + console.log(`\nBatch audio description generated: ${outputAudioPath}`); + console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`); + printStats(stats, settings); + + return { + videoFile: videoFilePath, + audioDescriptionFile: outputAudioPath + }; +} + +/** + * Print out statistics + * @param {object} stats - Statistics object + * @param {object} settings - Configuration settings + */ +function printStats(stats, settings) { + // Pricing constants (as of March 2025, update as needed) + const pricing = { + // Get pricing based on vision provider + vision: { + openai: { + 'gpt-4o': { + input: 0.0025, // per 1K input tokens + output: 0.01 // per 1K output tokens + } + // Add other OpenAI models here + } + // Add other vision providers here + }, + // Get pricing based on TTS provider + tts: { + openai: { + 'tts-1': 0.015, // per 1K characters + 'tts-1-hd': 0.030 // per 1K characters + } + // Add other TTS providers here + } + }; + + // Get the pricing for the selected providers + const visionProvider = settings.visionProvider; + const visionModel = settings.visionProviders[visionProvider].model; + const ttsProvider = settings.ttsProvider; + const ttsModel = settings.ttsProviders[ttsProvider].model; + + // Check if the pricing data exists + const visionPricing = pricing.vision[visionProvider]?.[visionModel]; + const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel]; + + if (!visionPricing) { + console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`); + } + + if (!ttsPricing) { + console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`); + } + + // Calculate prices using available pricing data + const visionInputCost = visionPricing ? (stats.totalVisionInputCost * visionPricing.input / 1000) : 0; + const visionOutputCost = visionPricing ? (stats.totalVisionOutputCost * visionPricing.output / 1000) : 0; + const ttsCost = ttsPricing ? 
(stats.totalTTSCost * ttsPricing / 1000) : 0; + const totalCost = visionInputCost + visionOutputCost + ttsCost; + + // Print out the stats + console.log('\n=== STATISTICS ==='); + console.log(`Vision provider: ${visionProvider}, Model: ${visionModel}`); + console.log(`TTS provider: ${ttsProvider}, Model: ${ttsModel}`); + console.log(`Total vision input cost: ${visionInputCost.toFixed(4)}`); + console.log(`Total vision output cost: ${visionOutputCost.toFixed(4)}`); + console.log(`Total TTS cost: ${ttsCost.toFixed(4)}`); + console.log(`Total cost: ${totalCost.toFixed(4)}`); +} + /** * Estimate the cost of generating audio descriptions for a video * @param {string} videoFilePath - Path to the input video file * @param {object} options - Optional configuration overrides * @returns {object} Cost estimation breakdown */ +async function estimateCost(videoFilePath, options = {}) { + // Merge provided options with defaults + const settings = { ...defaultConfig, ...options }; + + // Get video duration + const videoDuration = getVideoDuration(videoFilePath); + console.log(`Video duration: ${videoDuration} seconds`); + + // Calculate the number of frames or batches to process + let totalUnits; + let unitCostMultiplier; + let unitType; + + if (settings.batchTimeMode) { + totalUnits = Math.floor(videoDuration / settings.batchWindowDuration); + unitCostMultiplier = settings.framesInBatch; // Cost multiplier for batch mode + unitType = "batches"; + } else { + totalUnits = Math.floor(videoDuration / settings.captureIntervalSeconds); + unitCostMultiplier = 1; // No multiplier for normal mode + unitType = "frames"; + } + + console.log(`Will process ${totalUnits} ${unitType}`); + + // Pricing constants (as of March 2025, update as needed) + const pricing = { + // Get pricing based on vision provider + vision: { + openai: { + 'gpt-4o': { + input: 0.0025, // per 1K input tokens + output: 0.01 // per 1K output tokens + } + // Add other OpenAI models here + } + // Add other vision providers here + }, + // Get pricing based on TTS provider + tts: { + openai: { + 'tts-1': 0.015, // per 1K characters + 'tts-1-hd': 0.030 // per 1K characters + } + // Add other TTS providers here + } + }; + + // Get the pricing for the selected providers + const visionProvider = settings.visionProvider; + const visionModel = settings.visionProviders[visionProvider].model; + const ttsProvider = settings.ttsProvider; + const ttsModel = settings.ttsProviders[ttsProvider].model; + + // Check if the pricing data exists + const visionPricing = pricing.vision[visionProvider]?.[visionModel]; + const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel]; + + if (!visionPricing) { + console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`); + } + + if (!ttsPricing) { + console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`); + } + + // Estimated token counts + const estimatedVisionInputTokens = 1000 * unitCostMultiplier; // Base tokens for the vision input + const estimatedPromptTokens = 100; // Tokens for the prompt text + const estimatedOutputTokensPerUnit = 75; // Average tokens for description output + + // Estimated character counts for TTS + const estimatedCharsPerDescription = 200; // Average characters per description + + // Calculate estimated costs for first unit + const firstUnitCost = { + visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000, + visionOutput: estimatedOutputTokensPerUnit * 
(visionPricing?.output || 0) / 1000, + tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000 + }; + + // For subsequent units, we need context (e.g., previous frames) + const contextMultiplier = settings.batchTimeMode ? 1.2 : 2; // Less overhead in batch mode + + const subsequentUnitCost = { + visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000, + visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000, + tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000 + }; + + // Calculate total costs + const totalVisionInputCost = + firstUnitCost.visionInput + + (totalUnits - 1) * subsequentUnitCost.visionInput; + + const totalVisionOutputCost = + firstUnitCost.visionOutput + + (totalUnits - 1) * subsequentUnitCost.visionOutput; + + const totalTTSCost = + firstUnitCost.tts + + (totalUnits - 1) * subsequentUnitCost.tts; + + const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost; + + // Create cost breakdown + const costBreakdown = { + videoInfo: { + duration: videoDuration, + totalUnits: totalUnits, + unitType: unitType, + processingInterval: settings.batchTimeMode ? settings.batchWindowDuration : settings.captureIntervalSeconds + }, + providerInfo: { + visionProvider: visionProvider, + visionModel: visionModel, + ttsProvider: ttsProvider, + ttsModel: ttsModel + }, + apiCosts: { + visionInput: totalVisionInputCost.toFixed(4), + visionOutput: totalVisionOutputCost.toFixed(4), + tts: totalTTSCost.toFixed(4), + total: totalCost.toFixed(4) + }, + estimates: { + totalAPICallsToProviders: totalUnits * 2, // Vision + TTS for each unit + estimatedProcessingTimeMinutes: (totalUnits * 3) / 60 // rough estimate, 3 seconds per unit + } + }; + + return costBreakdown; +} + /** * Load configuration from a JSON file * @param {string} filePath - Path to the configuration file @@ -935,131 +1303,6 @@ function saveConfigToFile(filePath, config) { } } -/** - * Print out statistics - * @param {object} stats - Statistics object - * @param {object} settings - Configuration settings - */ -function printStats(stats, settings) { - // Pricing constants (as of March 2025, update as needed) - const pricing = { - // OpenAI pricing (per 1000 tokens) - gpt4o: { - input: 0.0025, // $0.0025 per 1K input tokens - output: 0.01 // $0.01 per 1K output tokens - }, - // TTS pricing (per 1000 characters) - tts: { - 'tts-1': 0.015, // $0.015 per 1K characters - 'tts-1-hd': 0.030 // $0.030 per 1K characters - } - }; - - // Calculate prices. The stats object contains amount of tokens. 
- stats.totalVisionInputCost = stats.totalVisionInputCost * pricing.gpt4o.input / 1000; - stats.totalVisionOutputCost = stats.totalVisionOutputCost * pricing.gpt4o.output / 1000; - stats.totalTTSCost = stats.totalTTSCost * pricing.tts[settings.ttsModel] / 1000; - stats.totalCost = stats.totalVisionInputCost + stats.totalVisionOutputCost + stats.totalTTSCost; - - // Print out the stats - console.log('\n=== STATISTICS ==='); - console.log(`Total vision input cost: ${stats.totalVisionInputCost.toFixed(4)}`); - console.log(`Total vision output cost: ${stats.totalVisionOutputCost.toFixed(4)}`); - console.log(`Total TTS cost: ${stats.totalTTSCost.toFixed(4)}`); - console.log(`Total cost: ${stats.totalCost.toFixed(4)}`); -} - -async function estimateCost(videoFilePath, options = {}) { - // Merge provided options with defaults - const settings = { ...defaultConfig, ...options }; - - // Get video duration - const videoDuration = getVideoDuration(videoFilePath); - console.log(`Video duration: ${videoDuration} seconds`); - - // Calculate the number of frames to capture - const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds); - console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`); - - // Pricing constants (as of March 2025, update as needed) - const pricing = { - // OpenAI pricing (per 1000 tokens) - gpt4o: { - input: 0.01, // $0.01 per 1K input tokens - output: 0.03 // $0.03 per 1K output tokens - }, - // TTS pricing (per 1000 characters) - tts: { - 'tts-1': 0.015, // $0.015 per 1K characters - 'tts-1-hd': 0.030 // $0.030 per 1K characters - } - }; - - // Check if the TTS model exists in our pricing table - if (!pricing.tts[settings.ttsModel]) { - console.warn(`Warning: TTS model "${settings.ttsModel}" not found in pricing table. Using tts-1-hd pricing.`); - settings.ttsModel = 'tts-1-hd'; // Fallback to a known model - } - - // Estimated token counts - const estimatedVisionInputTokens = 1000; // Base tokens for the vision input (approx. 
for a single image) - const estimatedPromptTokens = 50; // Tokens for the prompt text - const estimatedOutputTokensPerFrame = 50; // Average tokens for description output - - // Estimated character counts for TTS - const estimatedCharsPerDescription = 200; // Average characters per description - - // Calculate estimated costs - const firstFrameCost = { - visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * pricing.gpt4o.input / 1000, - visionOutput: estimatedOutputTokensPerFrame * pricing.gpt4o.output / 1000, - tts: estimatedCharsPerDescription * pricing.tts[settings.ttsModel] / 1000 - }; - - const subsequentFrameCost = { - // For subsequent frames, we need two images (previous + current) - visionInput: (estimatedVisionInputTokens * 2 + estimatedPromptTokens) * pricing.gpt4o.input / 1000, - visionOutput: estimatedOutputTokensPerFrame * pricing.gpt4o.output / 1000, - tts: estimatedCharsPerDescription * pricing.tts[settings.ttsModel] / 1000 - }; - - // Calculate total costs - const totalVisionInputCost = - firstFrameCost.visionInput + - (totalFrames - 1) * subsequentFrameCost.visionInput; - - const totalVisionOutputCost = - firstFrameCost.visionOutput + - (totalFrames - 1) * subsequentFrameCost.visionOutput; - - const totalTTSCost = - firstFrameCost.tts + - (totalFrames - 1) * subsequentFrameCost.tts; - - const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost; - - // Create cost breakdown - const costBreakdown = { - videoInfo: { - duration: videoDuration, - totalFrames: totalFrames, - captureInterval: settings.captureIntervalSeconds - }, - apiCosts: { - visionInput: totalVisionInputCost.toFixed(4), - visionOutput: totalVisionOutputCost.toFixed(4), - tts: totalTTSCost.toFixed(4), - total: totalCost.toFixed(4) - }, - estimates: { - totalAPICallsToOpenAI: totalFrames * 2, // Vision + TTS for each frame - estimatedProcessingTimeMinutes: (totalFrames * 3) / 60 // rough estimate, 3 seconds per frame - } - }; - - return costBreakdown; -} - // Main execution if (require.main === module) { // Parse command line arguments @@ -1083,6 +1326,28 @@ if (require.main === module) { } }); + // Handle nested provider configurations + if (argv.visionModel) { + if (!config.visionProviders[config.visionProvider]) { + config.visionProviders[config.visionProvider] = {}; + } + config.visionProviders[config.visionProvider].model = argv.visionModel; + } + + if (argv.ttsModel) { + if (!config.ttsProviders[config.ttsProvider]) { + config.ttsProviders[config.ttsProvider] = {}; + } + config.ttsProviders[config.ttsProvider].model = argv.ttsModel; + } + + if (argv.ttsVoice) { + if (!config.ttsProviders[config.ttsProvider]) { + config.ttsProviders[config.ttsProvider] = {}; + } + config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice; + } + // Save configuration if requested if (argv.saveConfig) { saveConfigToFile(argv.saveConfig, config); @@ -1123,5 +1388,7 @@ if (require.main === module) { module.exports = { generateAudioDescription, estimateCost, - config: defaultConfig + config: defaultConfig, + VisionProviderFactory, + TTSProviderFactory }; \ No newline at end of file
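As a rough sketch of the extension point the new factories expose (all names below are hypothetical and not part of this patch), an additional TTS engine is just a class whose textToSpeech returns { duration, cost }, plus one more case in TTSProviderFactory.getProvider and one more entry in ttsProviders:

// Hypothetical example provider: it demonstrates only the interface contract consumed by
// generateAudioDescription / generateAudioDescriptionBatch. Instead of calling a real TTS
// engine it writes silence with ffmpeg (the same fallback command the patch already uses).
const { execSync } = require('child_process');

class SilentTTSProvider {
  constructor(config) {
    this.config = config; // whatever is configured under settings.ttsProviders.silent
  }

  async textToSpeech(text, outputPath, options = {}) {
    // Roughly one second of silence per 15 characters, so downstream timing code is exercised.
    const duration = Math.max(1, Math.round(text.length / 15));
    execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t ${duration} -q:a 9 -acodec libmp3lame "${outputPath}" -y`);
    return { duration, cost: 0 }; // no per-character charge for silence
  }
}

// Wiring it up would mean adding, inside TTSProviderFactory.getProvider:
//   case 'silent':
//     return new SilentTTSProvider(providerConfig);
// and, in defaultConfig:
//   ttsProvider: "silent",
//   ttsProviders: { openai: { ... }, silent: {} }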
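For reference, a minimal way to exercise the exported surface after this patch (the video path, voice and output directory are placeholders; ffmpeg/ffprobe and OPENAI_API_KEY are assumed to be available):

const { generateAudioDescription, estimateCost } = require('./index');

(async () => {
  // Dry run: estimateCost only reads the video duration locally; no vision or TTS calls are made.
  const breakdown = await estimateCost('/path/to/video.mp4');
  console.log(breakdown.apiCosts);

  // Full run with a few overrides merged onto defaultConfig.
  const result = await generateAudioDescription('/path/to/video.mp4', {
    ttsVoice: 'nova',
    ttsSpeedFactor: 1.3,
    outputDir: './out/'
  });
  console.log(`Description track written to ${result.audioDescriptionFile}`);
})();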