Schedule descriptions later if audio files are too long

other-providers
Talon 2025-03-11 20:58:49 +01:00
parent a6cb8efc0c
commit 7e0b9cf220
1 changed file with 269 additions and 196 deletions
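In short: instead of pinning every narration clip to its fixed capture slot, the loop now measures each generated TTS clip and starts the next description no earlier than the end of the previous clip plus a small buffer, snapping back to the original timeline whenever the audio finishes early. A minimal sketch of that scheduling rule (function name hypothetical, not taken from the diff):

// Sketch of the scheduling rule the new code applies: never overlap the
// previous clip, but catch back up to the ideal timeline when possible.
function nextStartTime(prevStart, prevAudioDuration, idealNext, buffer = 0.25) {
  const earliest = prevStart + prevAudioDuration + buffer; // end of last clip + gap
  return Math.max(earliest, idealNext); // snap back if we are ahead of schedule
}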

index.js

@@ -139,106 +139,6 @@ function parseCommandLineArgs() {
.argv;
}
/**
* Main function to process a video file and generate audio descriptions
* @param {string} videoFilePath - Path to the input video file
* @param {object} options - Optional configuration overrides
*/
async function generateAudioDescription(videoFilePath, options = {}) {
// Merge provided options with defaults
const settings = { ...defaultConfig, ...options };
// Ensure temporary and output directories exist
if (!fs.existsSync(settings.tempDir)) {
fs.mkdirSync(settings.tempDir, { recursive: true });
}
if (!fs.existsSync(settings.outputDir)) {
fs.mkdirSync(settings.outputDir, { recursive: true });
}
// Get video duration
const videoDuration = getVideoDuration(videoFilePath);
stats.totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
console.log(`Video duration: ${videoDuration} seconds`);
// If batchTimeMode is enabled, use the new approach
if (settings.batchTimeMode) {
return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings);
}
// Calculate the number of frames to capture
const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);
// Context window to store previous frames
const frameContext = [];
// Array to store audio segment information
const audioSegments = [];
// Process each frame
for (let i = 0; i < totalFrames; i++) {
const timePosition = i * settings.captureIntervalSeconds;
const frameFilePath = path.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`);
// Capture frame at current time position
captureVideoFrame(videoFilePath, timePosition, frameFilePath);
console.log(`Captured frame at ${timePosition} seconds`);
// Add current frame to context
const currentFrame = {
index: i,
path: frameFilePath,
timePosition
};
frameContext.push(currentFrame);
// Keep context window at specified size
if (frameContext.length > settings.contextWindowSize) {
frameContext.shift();
}
// Generate description
let description;
if (frameContext.length === 1) {
// First frame - just describe what's in it
description = await describeFrame(frameFilePath, settings.defaultPrompt);
} else {
// Compare with previous frame
const previousFrame = frameContext[frameContext.length - 2];
description = await describeFrameChange(previousFrame.path, frameFilePath, settings.changePrompt);
}
console.log(`Description: ${description}`);
// Generate speech from description
const audioFilePath = path.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`);
await textToSpeech(description, audioFilePath, settings.ttsModel, settings.ttsVoice, settings.ttsSpeedFactor || 2);
// Store segment information
audioSegments.push({
audioFile: audioFilePath,
startTime: timePosition,
description
});
}
// Combine audio segments into final audio description track
const outputAudioPath = path.join(settings.outputDir, `${path.basename(videoFilePath, path.extname(videoFilePath))}_description.mp3`);
combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
// Clean up temporary files if desired
// cleanupTempFiles(settings.tempDir);
console.log(`\nAudio description generated: ${outputAudioPath}`);
console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
printStats(stats, settings);
return {
videoFile: videoFilePath,
audioDescriptionFile: outputAudioPath
};
}
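The deleted version above is the source of the overlap bug this commit fixes: every segment is pushed with startTime = i * captureIntervalSeconds, and the return value of textToSpeech is discarded, so nothing accounts for how long each clip actually plays. With a 10-second capture interval, a description whose TTS clip runs 14 seconds is still followed by a neighbor at the 10-second mark, leaving roughly 4 seconds of overlapping speech.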
/**
* Get the duration of a video file in seconds
@@ -361,85 +261,6 @@ async function describeFrameChange(previousFramePath, currentFramePath, prompt)
}
}
/**
* Generate audio description using the new "batch time" mode.
* @param {string} videoFilePath - Path to the input video file
* @param {number} videoDuration - Duration of the video in seconds
* @param {object} settings - The merged config and user options
*/
async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings) {
const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration);
console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`);
// We'll hold the last batch's frames or last batch's description for context
let lastBatchContext = [];
const audioSegments = [];
for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
const batchStart = batchIndex * settings.batchWindowDuration;
const batchEnd = batchStart + settings.batchWindowDuration;
if (batchEnd > videoDuration) break; // Safety check
// Capture frames for this batch
const framePaths = [];
for (let i = 0; i < settings.framesInBatch; i++) {
const t = batchStart + (i * settings.batchWindowDuration) / settings.framesInBatch;
const frameFilePath = path.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`);
captureVideoFrame(videoFilePath, t, frameFilePath);
framePaths.push(frameFilePath);
}
// Use AI to describe this batch of frames, possibly providing some context
let description = await describeBatchOfFrames(
framePaths,
lastBatchContext,
settings.batchPrompt
);
console.log(`Batch #${batchIndex} description:\n${description}\n`);
// Convert description to TTS
const audioFilePath = path.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`);
await textToSpeech(
description,
audioFilePath,
settings.ttsModel,
settings.ttsVoice,
settings.ttsSpeedFactor
);
// Store segment info. We'll align the entire description at the start of the batch
audioSegments.push({
audioFile: audioFilePath,
startTime: batchStart,
description
});
// Update lastBatchContext so the next batch can keep track of what's previously seen
lastBatchContext = {
lastDescription: description,
lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch
};
}
// Combine all the audio segments into one track
const outputAudioPath = path.join(
settings.outputDir,
`${path.basename(videoFilePath, path.extname(videoFilePath))}_description_batch.mp3`
);
combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
console.log(`\nBatch audio description generated: ${outputAudioPath}`);
console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
printStats(stats, settings);
return {
videoFile: videoFilePath,
audioDescriptionFile: outputAudioPath
};
}
/**
* Describe a batch of frames using AI, optionally providing context (last batch's data).
* @param {string[]} framePaths - Array of file paths for this batch's frames
@@ -493,7 +314,7 @@ async function describeBatchOfFrames(framePaths, lastBatchContext, batchPrompt)
stats.totalVisionInputCost += response.usage.prompt_tokens;
stats.totalVisionOutputCost += response.usage.completion_tokens;
stats.totalCost += response.usage.total_tokens;
return response.choices[0].message.content.trim();
} catch (error) {
console.error("Error describing batch of frames:", error);
@@ -501,13 +322,265 @@ async function describeBatchOfFrames(framePaths, lastBatchContext, batchPrompt)
}
}
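Note the bookkeeping convention visible in this hunk: despite their names, totalVisionInputCost and totalVisionOutputCost accumulate raw token counts at call time; printStats (further down) multiplies them by per-1K prices to turn them into dollar figures, and totalCost is likewise overwritten there with the final dollar sum.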
// Modified function to prevent audio overlap
async function generateAudioDescription(videoFilePath, options = {}) {
// Merge provided options with defaults
const settings = { ...defaultConfig, ...options };
// Ensure temporary and output directories exist
if (!fs.existsSync(settings.tempDir)) {
fs.mkdirSync(settings.tempDir, { recursive: true });
}
if (!fs.existsSync(settings.outputDir)) {
fs.mkdirSync(settings.outputDir, { recursive: true });
}
// Get video duration
const videoDuration = getVideoDuration(videoFilePath);
stats.totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
console.log(`Video duration: ${videoDuration} seconds`);
// If batchTimeMode is enabled, use the new approach
if (settings.batchTimeMode) {
return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings);
}
// Calculate the number of frames to capture
const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);
// Context window to store previous frames
const frameContext = [];
// Array to store audio segment information
const audioSegments = [];
// Track our current time position (will be adjusted for audio overlap)
let currentTimePosition = 0;
// Track drift from the original schedule
let timelineDrift = 0;
const maxAllowableDrift = settings.captureIntervalSeconds * 2; // Maximum drift before warning
// Process each frame
for (let i = 0; i < totalFrames; i++) {
// Calculate the ideal time position based on the original schedule
const idealTimePosition = i * settings.captureIntervalSeconds;
// Use the adjusted time position that accounts for previous audio durations
const timePosition = currentTimePosition;
// Calculate drift from the original schedule
timelineDrift = timePosition - idealTimePosition;
// Log if drift is becoming significant
if (Math.abs(timelineDrift) > maxAllowableDrift) {
console.warn(`WARNING: Timeline drift at frame ${i} is ${timelineDrift.toFixed(2)} seconds.`);
}
const frameFilePath = path.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`);
// Capture frame at current time position (use the ideal time to capture the frame)
captureVideoFrame(videoFilePath, idealTimePosition, frameFilePath);
console.log(`Captured frame at ${idealTimePosition} seconds (scheduled at ${timePosition.toFixed(2)} seconds)`);
// Add current frame to context
const currentFrame = {
index: i,
path: frameFilePath,
timePosition
};
frameContext.push(currentFrame);
// Keep context window at specified size
if (frameContext.length > settings.contextWindowSize) {
frameContext.shift();
}
// Generate description
let description;
if (frameContext.length === 1) {
// First frame - just describe what's in it
description = await describeFrame(frameFilePath, settings.defaultPrompt);
} else {
// Compare with previous frame
const previousFrame = frameContext[frameContext.length - 2];
description = await describeFrameChange(previousFrame.path, frameFilePath, settings.changePrompt);
}
console.log(`Description: ${description}`);
// Generate speech from description
const audioFilePath = path.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`);
const audioDuration = await textToSpeech(description, audioFilePath, settings.ttsModel, settings.ttsVoice, settings.ttsSpeedFactor || 2);
console.log(`Audio duration: ${audioDuration} seconds`);
// Store segment information
audioSegments.push({
audioFile: audioFilePath,
startTime: timePosition,
duration: audioDuration,
description
});
// Update the time position for the next iteration
// Add a small buffer (0.25 sec) between descriptions to prevent hard cuts
const bufferTime = 0.25;
currentTimePosition = timePosition + audioDuration + bufferTime;
// If we've fallen behind schedule, try to catch up (but don't skip content)
const nextIdealPosition = (i + 1) * settings.captureIntervalSeconds;
if (currentTimePosition < nextIdealPosition) {
console.log(`Audio finished before next scheduled frame. Catching up with timeline.`);
currentTimePosition = nextIdealPosition;
timelineDrift = 0; // Reset drift since we've caught up
}
}
// Combine audio segments into final audio description track
const outputAudioPath = path.join(settings.outputDir, `${path.basename(videoFilePath, path.extname(videoFilePath))}_description.mp3`);
combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
// Clean up temporary files if desired
// cleanupTempFiles(settings.tempDir);
console.log(`\nAudio description generated: ${outputAudioPath}`);
console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
printStats(stats, settings);
return {
videoFile: videoFilePath,
audioDescriptionFile: outputAudioPath
};
}
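To make the new control flow concrete, a trace with captureIntervalSeconds = 10 and the 0.25-second buffer: frame 0 plays at 0 s; if its clip runs 14 s, frame 1 is scheduled at 14.25 s (a drift of +4.25 s, well inside the 20 s warning threshold); if frame 1's clip then runs only 3 s, 14.25 + 3 + 0.25 = 17.5 s falls before the ideal 20 s slot, so frame 2 snaps back to 20 s and the drift resets to zero. Frames themselves are still captured at the ideal positions, so a late-playing description narrates the content it was generated from.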
/**
* Generate audio description using the new "batch time" mode with overlap prevention.
* @param {string} videoFilePath - Path to the input video file
* @param {number} videoDuration - Duration of the video in seconds
* @param {object} settings - The merged config and user options
*/
async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings) {
const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration);
console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`);
// We'll hold the last batch's frames or last batch's description for context
let lastBatchContext = [];
const audioSegments = [];
// Track our current time position (will be adjusted for audio overlap)
let currentTimePosition = 0;
// Track drift from the original schedule
let timelineDrift = 0;
const maxAllowableDrift = settings.batchWindowDuration * 0.5; // Maximum drift of 50% of batch window
for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
// Calculate ideal batch timing based on configuration
const idealBatchStart = batchIndex * settings.batchWindowDuration;
// Use adjusted time position that accounts for previous audio durations
const batchStart = currentTimePosition;
// Calculate drift from the original schedule
timelineDrift = batchStart - idealBatchStart;
// Log if drift is becoming significant
if (Math.abs(timelineDrift) > maxAllowableDrift) {
console.warn(`WARNING: Timeline drift at batch ${batchIndex} is ${timelineDrift.toFixed(2)} seconds.`);
}
const batchEnd = idealBatchStart + settings.batchWindowDuration;
if (batchEnd > videoDuration) break; // Safety check
console.log(`Processing batch #${batchIndex}: Original time window ${idealBatchStart}-${batchEnd} sec, scheduled at ${batchStart.toFixed(2)} sec`);
// Capture frames for this batch - use the ideal timing for frame capture
const framePaths = [];
for (let i = 0; i < settings.framesInBatch; i++) {
const t = idealBatchStart + (i * settings.batchWindowDuration) / settings.framesInBatch;
const frameFilePath = path.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`);
captureVideoFrame(videoFilePath, t, frameFilePath);
framePaths.push(frameFilePath);
}
// Use AI to describe this batch of frames, possibly providing some context
let description = await describeBatchOfFrames(
framePaths,
lastBatchContext,
settings.batchPrompt
);
console.log(`Batch #${batchIndex} description:\n${description}\n`);
// Convert description to TTS
const audioFilePath = path.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`);
const audioDuration = await textToSpeech(
description,
audioFilePath,
settings.ttsModel,
settings.ttsVoice,
settings.ttsSpeedFactor
);
console.log(`Batch #${batchIndex} audio duration: ${audioDuration} seconds`);
// Store segment info with the adjusted start time
audioSegments.push({
audioFile: audioFilePath,
startTime: batchStart,
duration: audioDuration,
description
});
// Update the time position for the next iteration
// Add a small buffer (0.5 sec) between descriptions
const bufferTime = 0.5;
currentTimePosition = batchStart + audioDuration + bufferTime;
// If we've fallen behind schedule, try to catch up (but don't skip content)
const nextIdealPosition = (batchIndex + 1) * settings.batchWindowDuration;
if (currentTimePosition < nextIdealPosition) {
console.log(`Batch audio finished before next scheduled batch. Catching up with timeline.`);
currentTimePosition = nextIdealPosition;
timelineDrift = 0; // Reset drift since we've caught up
}
// Update lastBatchContext so the next batch can keep track of what's previously seen
lastBatchContext = {
lastDescription: description,
lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch
};
}
// Combine all the audio segments into one track
const outputAudioPath = path.join(
settings.outputDir,
`${path.basename(videoFilePath, path.extname(videoFilePath))}_description_batch.mp3`
);
combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
console.log(`\nBatch audio description generated: ${outputAudioPath}`);
console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
printStats(stats, settings);
return {
videoFile: videoFilePath,
audioDescriptionFile: outputAudioPath
};
}
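The batch variant follows the same pattern with two differences: a larger 0.5-second buffer between clips, and a drift warning at half the batch window rather than at twice the capture interval. One thing to watch: lastBatchContext is initialized as an empty array but reassigned to a { lastDescription, lastFramePaths } object after the first batch, so describeBatchOfFrames presumably treats anything without a lastDescription property as "no context"; initializing it as null would express that intent more directly.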
/**
* Convert text to speech using AI with speed adjustment, and return the actual duration
* @param {string} text - Text to convert to speech
* @param {string} outputPath - Output path for the audio file
* @param {string} model - TTS model to use
* @param {string} voice - Voice to use for TTS
* @param {number} speedFactor - Speed multiplier (1.0 = normal speed, 2.0 = double speed)
* @returns {number} The actual duration of the generated audio in seconds
*/
async function textToSpeech(text, outputPath, model, voice, speedFactor = 1.3) {
try {
@@ -531,7 +604,7 @@ async function textToSpeech(text, outputPath, model, voice, speedFactor = 1.3) {
// Clean up temporary file
fs.unlinkSync(tempOutputPath);
// Get actual audio duration for accurate timing
const audioDuration = getAudioDuration(outputPath);
return audioDuration;
} catch (error) {
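This hunk leans on a getAudioDuration helper defined outside the visible diff. A minimal sketch of what such a helper typically looks like, assuming ffprobe is on the PATH (not the repository's actual implementation):

const { execSync } = require('child_process');

// Read a media file's duration in seconds using ffprobe.
function getAudioDuration(filePath) {
  const output = execSync(
    `ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "${filePath}"`
  ).toString().trim();
  return parseFloat(output); // e.g. "14.250000" -> 14.25
}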
@@ -854,7 +927,7 @@ function saveConfigToFile(filePath, config) {
const configToSave = { ...config };
const keysToExclude = ['_', '$0', 'video_file_path', 'estimate', 'config', 'saveConfig', 'help', 'version', 'h'];
keysToExclude.forEach(key => delete configToSave[key]);
fs.writeFileSync(filePath, JSON.stringify(configToSave, null, 2), 'utf8');
console.log(`Configuration saved to ${filePath}`);
} catch (error) {
@@ -881,13 +954,13 @@ function printStats(stats, settings) {
'tts-1-hd': 0.030 // $0.030 per 1K characters
}
};
// Calculate prices. The stats object contains token counts.
stats.totalVisionInputCost = stats.totalVisionInputCost * pricing.gpt4o.input / 1000;
stats.totalVisionOutputCost = stats.totalVisionOutputCost * pricing.gpt4o.output / 1000;
stats.totalTTSCost = stats.totalTTSCost * pricing.tts[settings.ttsModel] / 1000;
stats.totalCost = stats.totalVisionInputCost + stats.totalVisionOutputCost + stats.totalTTSCost;
// Print out the stats
console.log('\n=== STATISTICS ===');
console.log(`Total vision input cost: ${stats.totalVisionInputCost.toFixed(4)}`);
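The conversion in this hunk is tokens times price-per-1K divided by 1000: for example, 120,000 accumulated prompt tokens at a hypothetical $0.0025 per 1K tokens come out to 120000 * 0.0025 / 1000 = $0.30. The gpt4o rates themselves sit in the pricing object just above the visible lines.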
@@ -907,7 +980,7 @@ async function estimateCost(videoFilePath, options = {}) {
// Calculate the number of frames to capture
const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);
// Pricing constants (as of March 2025, update as needed)
const pricing = {
// OpenAI pricing (per 1000 tokens)
@@ -991,30 +1064,30 @@ async function estimateCost(videoFilePath, options = {}) {
if (require.main === module) {
// Parse command line arguments
const argv = parseCommandLineArgs();
// Start with default config
let config = { ...defaultConfig };
// If a config file is specified, load it
if (argv.config) {
const fileConfig = loadConfigFromFile(argv.config);
config = { ...config, ...fileConfig };
}
// Override with any command line arguments
Object.keys(argv).forEach(key => {
if (key !== '_' && key !== '$0' && key !== 'config' && key !== 'saveConfig' &&
key !== 'estimate' && key !== 'help' && key !== 'version' &&
argv[key] !== undefined) {
config[key] = argv[key];
}
});
// Save configuration if requested
if (argv.saveConfig) {
saveConfigToFile(argv.saveConfig, config);
}
// Check if a video file is provided
if (argv._.length < 1) {
console.error('Error: No video file specified');
@@ -1022,9 +1095,9 @@ if (require.main === module) {
console.log('Use --help for more information');
process.exit(1);
}
const videoFilePath = argv._[0];
// Run estimation or full processing
if (argv.estimate) {
estimateCost(videoFilePath, config)