Schedule descriptions later if audio files are too long

other-providers
Talon 2025-03-11 20:58:49 +01:00
parent a6cb8efc0c
commit 7e0b9cf220
1 changed file with 269 additions and 196 deletions

index.js

@@ -139,106 +139,6 @@ function parseCommandLineArgs() {
.argv;
}
/**
* Main function to process a video file and generate audio descriptions
* @param {string} videoFilePath - Path to the input video file
* @param {object} options - Optional configuration overrides
*/
async function generateAudioDescription(videoFilePath, options = {}) {
// Merge provided options with defaults
const settings = { ...defaultConfig, ...options };
// Ensure temporary and output directories exist
if (!fs.existsSync(settings.tempDir)) {
fs.mkdirSync(settings.tempDir, { recursive: true });
}
if (!fs.existsSync(settings.outputDir)) {
fs.mkdirSync(settings.outputDir, { recursive: true });
}
// Get video duration
const videoDuration = getVideoDuration(videoFilePath);
stats.totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
console.log(`Video duration: ${videoDuration} seconds`);
// If batchTimeMode is enabled, use the new approach
if (settings.batchTimeMode) {
return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings);
}
// Calculate the number of frames to capture
const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);
// Context window to store previous frames
const frameContext = [];
// Array to store audio segment information
const audioSegments = [];
// Process each frame
for (let i = 0; i < totalFrames; i++) {
const timePosition = i * settings.captureIntervalSeconds;
const frameFilePath = path.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`);
// Capture frame at current time position
captureVideoFrame(videoFilePath, timePosition, frameFilePath);
console.log(`Captured frame at ${timePosition} seconds`);
// Add current frame to context
const currentFrame = {
index: i,
path: frameFilePath,
timePosition
};
frameContext.push(currentFrame);
// Keep context window at specified size
if (frameContext.length > settings.contextWindowSize) {
frameContext.shift();
}
// Generate description
let description;
if (frameContext.length === 1) {
// First frame - just describe what's in it
description = await describeFrame(frameFilePath, settings.defaultPrompt);
} else {
// Compare with previous frame
const previousFrame = frameContext[frameContext.length - 2];
description = await describeFrameChange(previousFrame.path, frameFilePath, settings.changePrompt);
}
console.log(`Description: ${description}`);
// Generate speech from description
const audioFilePath = path.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`);
await textToSpeech(description, audioFilePath, settings.ttsModel, settings.ttsVoice, settings.ttsSpeedFactor || 2);
// Store segment information
audioSegments.push({
audioFile: audioFilePath,
startTime: timePosition,
description
});
}
// Combine audio segments into final audio description track
const outputAudioPath = path.join(settings.outputDir, `${path.basename(videoFilePath, path.extname(videoFilePath))}_description.mp3`);
combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
// Clean up temporary files if desired
// cleanupTempFiles(settings.tempDir);
console.log(`\nAudio description generated: ${outputAudioPath}`);
console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
printStats(stats, settings);
return {
videoFile: videoFilePath,
audioDescriptionFile: outputAudioPath
};
}
/**
* Get the duration of a video file in seconds
@@ -361,85 +261,6 @@ async function describeFrameChange(previousFramePath, currentFramePath, prompt)
}
}
/**
* Generate audio description using the new "batch time" mode.
* @param {string} videoFilePath - Path to the input video file
* @param {number} videoDuration - Duration of the video in seconds
* @param {object} settings - The merged config and user options
*/
async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings) {
const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration);
console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`);
// Context carried over from the previous batch: starts empty, later holds { lastDescription, lastFramePaths }
let lastBatchContext = [];
const audioSegments = [];
for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
const batchStart = batchIndex * settings.batchWindowDuration;
const batchEnd = batchStart + settings.batchWindowDuration;
if (batchEnd > videoDuration) break; // Safety check
// Capture frames for this batch
const framePaths = [];
for (let i = 0; i < settings.framesInBatch; i++) {
const t = batchStart + (i * settings.batchWindowDuration) / settings.framesInBatch;
const frameFilePath = path.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`);
captureVideoFrame(videoFilePath, t, frameFilePath);
framePaths.push(frameFilePath);
}
// Use AI to describe this batch of frames, possibly providing some context
let description = await describeBatchOfFrames(
framePaths,
lastBatchContext,
settings.batchPrompt
);
console.log(`Batch #${batchIndex} description:\n${description}\n`);
// Convert description to TTS
const audioFilePath = path.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`);
await textToSpeech(
description,
audioFilePath,
settings.ttsModel,
settings.ttsVoice,
settings.ttsSpeedFactor
);
// Store segment info. We'll align the entire description at the start of the batch
audioSegments.push({
audioFile: audioFilePath,
startTime: batchStart,
description
});
// Update lastBatchContext so the next batch can keep track of what's previously seen
lastBatchContext = {
lastDescription: description,
lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch
};
}
// Combine all the audio segments into one track
const outputAudioPath = path.join(
settings.outputDir,
`${path.basename(videoFilePath, path.extname(videoFilePath))}_description_batch.mp3`
);
combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
console.log(`\nBatch audio description generated: ${outputAudioPath}`);
console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
printStats(stats, settings);
return {
videoFile: videoFilePath,
audioDescriptionFile: outputAudioPath
};
}
/**
* Describe a batch of frames using AI, optionally providing context (last batch's data).
* @param {string[]} framePaths - Array of file paths for this batch's frames
@@ -501,13 +322,265 @@ async function describeBatchOfFrames(framePaths, lastBatchContext, batchPrompt)
}
}
// Modified function to prevent audio overlap
async function generateAudioDescription(videoFilePath, options = {}) {
// Merge provided options with defaults
const settings = { ...defaultConfig, ...options };
// Ensure temporary and output directories exist
if (!fs.existsSync(settings.tempDir)) {
fs.mkdirSync(settings.tempDir, { recursive: true });
}
if (!fs.existsSync(settings.outputDir)) {
fs.mkdirSync(settings.outputDir, { recursive: true });
}
// Get video duration
const videoDuration = getVideoDuration(videoFilePath);
stats.totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
console.log(`Video duration: ${videoDuration} seconds`);
// If batchTimeMode is enabled, use the new approach
if (settings.batchTimeMode) {
return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings);
}
// Calculate the number of frames to capture
const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);
// Context window to store previous frames
const frameContext = [];
// Array to store audio segment information
const audioSegments = [];
// Track our current time position (will be adjusted for audio overlap)
let currentTimePosition = 0;
// Track drift from the original schedule
let timelineDrift = 0;
const maxAllowableDrift = settings.captureIntervalSeconds * 2; // Maximum drift before warning
// Process each frame
for (let i = 0; i < totalFrames; i++) {
// Calculate the ideal time position based on the original schedule
const idealTimePosition = i * settings.captureIntervalSeconds;
// Use the adjusted time position that accounts for previous audio durations
const timePosition = currentTimePosition;
// Calculate drift from the original schedule
timelineDrift = timePosition - idealTimePosition;
// Log if drift is becoming significant
if (Math.abs(timelineDrift) > maxAllowableDrift) {
console.warn(`WARNING: Timeline drift at frame ${i} is ${timelineDrift.toFixed(2)} seconds.`);
}
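// Worked example (illustrative numbers, not from the source): with
// captureIntervalSeconds = 10, frame 3 ideally lands at 30 s. If earlier
// narration pushed currentTimePosition to 33.5 s, timelineDrift = 3.5 s,
// well under the warning threshold of captureIntervalSeconds * 2 = 20 s.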
const frameFilePath = path.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`);
// Capture frame at current time position (use the ideal time to capture the frame)
captureVideoFrame(videoFilePath, idealTimePosition, frameFilePath);
console.log(`Captured frame at ${idealTimePosition} seconds (description scheduled at ${timePosition.toFixed(2)} seconds)`);
// Add current frame to context
const currentFrame = {
index: i,
path: frameFilePath,
timePosition
};
frameContext.push(currentFrame);
// Keep context window at specified size
if (frameContext.length > settings.contextWindowSize) {
frameContext.shift();
}
// Generate description
let description;
if (frameContext.length === 1) {
// First frame - just describe what's in it
description = await describeFrame(frameFilePath, settings.defaultPrompt);
} else {
// Compare with previous frame
const previousFrame = frameContext[frameContext.length - 2];
description = await describeFrameChange(previousFrame.path, frameFilePath, settings.changePrompt);
}
console.log(`Description: ${description}`);
// Generate speech from description
const audioFilePath = path.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`);
const audioDuration = await textToSpeech(description, audioFilePath, settings.ttsModel, settings.ttsVoice, settings.ttsSpeedFactor || 2);
console.log(`Audio duration: ${audioDuration} seconds`);
// Store segment information
audioSegments.push({
audioFile: audioFilePath,
startTime: timePosition,
duration: audioDuration,
description
});
// Update the time position for the next iteration
// Add a small buffer (0.25 sec) between descriptions to prevent hard cuts
const bufferTime = 0.25;
currentTimePosition = timePosition + audioDuration + bufferTime;
// If we've fallen behind schedule, try to catch up (but don't skip content)
const nextIdealPosition = (i + 1) * settings.captureIntervalSeconds;
if (currentTimePosition < nextIdealPosition) {
console.log(`Audio finished before next scheduled frame. Catching up with timeline.`);
currentTimePosition = nextIdealPosition;
timelineDrift = 0; // Reset drift since we've caught up
}
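// Worked example of the repositioning step (illustrative numbers): a segment
// scheduled at timePosition = 30 s whose narration runs 4.25 s ends at
// 30 + 4.25 + 0.25 = 34.5 s. With captureIntervalSeconds = 10, the next ideal
// slot is 40 s, so currentTimePosition snaps forward to 40 and drift resets;
// had the narration run past 40 s, the next description would simply start
// late rather than overlap.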
}
// Combine audio segments into final audio description track
const outputAudioPath = path.join(settings.outputDir, `${path.basename(videoFilePath, path.extname(videoFilePath))}_description.mp3`);
combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
// Clean up temporary files if desired
// cleanupTempFiles(settings.tempDir);
console.log(`\nAudio description generated: ${outputAudioPath}`);
console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
printStats(stats, settings);
return {
videoFile: videoFilePath,
audioDescriptionFile: outputAudioPath
};
}
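// Example invocation (a sketch for illustration; the file name is a
// placeholder, while the option names come from the settings used above):
//
//   generateAudioDescription('input.mp4', {
//     captureIntervalSeconds: 10, // describe roughly every 10 s of video
//     ttsSpeedFactor: 1.5         // faster speech leaves less overlap to absorb
//   })
//     .then(({ audioDescriptionFile }) => console.log(`Done: ${audioDescriptionFile}`))
//     .catch(console.error);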
/**
* Generate audio description using the new "batch time" mode with overlap prevention.
* @param {string} videoFilePath - Path to the input video file
* @param {number} videoDuration - Duration of the video in seconds
* @param {object} settings - The merged config and user options
*/
async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings) {
const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration);
console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`);
// Context carried over from the previous batch: starts empty, later holds { lastDescription, lastFramePaths }
let lastBatchContext = [];
const audioSegments = [];
// Track our current time position (will be adjusted for audio overlap)
let currentTimePosition = 0;
// Track drift from the original schedule
let timelineDrift = 0;
const maxAllowableDrift = settings.batchWindowDuration * 0.5; // Maximum drift of 50% of batch window
for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
// Calculate ideal batch timing based on configuration
const idealBatchStart = batchIndex * settings.batchWindowDuration;
// Use adjusted time position that accounts for previous audio durations
const batchStart = currentTimePosition;
// Calculate drift from the original schedule
timelineDrift = batchStart - idealBatchStart;
// Log if drift is becoming significant
if (Math.abs(timelineDrift) > maxAllowableDrift) {
console.warn(`WARNING: Timeline drift at batch ${batchIndex} is ${timelineDrift.toFixed(2)} seconds.`);
}
const batchEnd = idealBatchStart + settings.batchWindowDuration;
if (batchEnd > videoDuration) break; // Safety check
console.log(`Processing batch #${batchIndex}: Original time window ${idealBatchStart}-${batchEnd} sec, scheduled at ${batchStart.toFixed(2)} sec`);
// Capture frames for this batch - use the ideal timing for frame capture
const framePaths = [];
for (let i = 0; i < settings.framesInBatch; i++) {
const t = idealBatchStart + (i * settings.batchWindowDuration) / settings.framesInBatch;
const frameFilePath = path.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`);
captureVideoFrame(videoFilePath, t, frameFilePath);
framePaths.push(frameFilePath);
}
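// Frame spacing example (illustrative numbers): with batchWindowDuration = 30
// and framesInBatch = 6, frames are sampled every 30 / 6 = 5 s, i.e. at
// idealBatchStart + 0, 5, 10, 15, 20, and 25, evenly covering the window
// without touching its right edge.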
// Use AI to describe this batch of frames, possibly providing some context
let description = await describeBatchOfFrames(
framePaths,
lastBatchContext,
settings.batchPrompt
);
console.log(`Batch #${batchIndex} description:\n${description}\n`);
// Convert description to TTS
const audioFilePath = path.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`);
const audioDuration = await textToSpeech(
description,
audioFilePath,
settings.ttsModel,
settings.ttsVoice,
settings.ttsSpeedFactor
);
console.log(`Batch #${batchIndex} audio duration: ${audioDuration} seconds`);
// Store segment info with the adjusted start time
audioSegments.push({
audioFile: audioFilePath,
startTime: batchStart,
duration: audioDuration,
description
});
// Update the time position for the next iteration
// Add a small buffer (0.5 sec) between descriptions
const bufferTime = 0.5;
currentTimePosition = batchStart + audioDuration + bufferTime;
// If we've fallen behind schedule, try to catch up (but don't skip content)
const nextIdealPosition = (batchIndex + 1) * settings.batchWindowDuration;
if (currentTimePosition < nextIdealPosition) {
console.log(`Batch audio finished before next scheduled batch. Catching up with timeline.`);
currentTimePosition = nextIdealPosition;
timelineDrift = 0; // Reset drift since we've caught up
}
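// Worked example (illustrative numbers): with batchWindowDuration = 30, batch 2
// ideally starts at 60 s. If earlier narration pushed batchStart to 78 s,
// timelineDrift = 18 s, which exceeds maxAllowableDrift = 15 s (half the
// window) and triggers the warning; whenever a batch's audio ends before the
// next ideal start, the schedule snaps back and drift resets to 0.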
// Update lastBatchContext so the next batch can keep track of what's previously seen
lastBatchContext = {
lastDescription: description,
lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch
};
}
// Combine all the audio segments into one track
const outputAudioPath = path.join(
settings.outputDir,
`${path.basename(videoFilePath, path.extname(videoFilePath))}_description_batch.mp3`
);
combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
console.log(`\nBatch audio description generated: ${outputAudioPath}`);
console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
printStats(stats, settings);
return {
videoFile: videoFilePath,
audioDescriptionFile: outputAudioPath
};
}
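// Example invocation of batch mode (a sketch for illustration; the option
// names come from the settings used above, the file name is a placeholder):
//
//   generateAudioDescription('input.mp4', {
//     batchTimeMode: true,     // routes into generateAudioDescriptionBatch
//     batchWindowDuration: 30, // one narrated description per 30 s window
//     framesInBatch: 6         // frames sampled per window for the model
//   });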
/**
* Convert text to speech using AI with speed adjustment, and return the actual duration
* @param {string} text - Text to convert to speech
* @param {string} outputPath - Output path for the audio file
* @param {string} model - TTS model to use
* @param {string} voice - Voice to use for TTS
* @param {number} speedFactor - Speed multiplier (1.0 = normal speed, 2.0 = double speed)
* @returns {Promise<number>} The actual duration of the generated audio in seconds
*/
async function textToSpeech(text, outputPath, model, voice, speedFactor = 1.3) {
try {
@@ -531,7 +604,7 @@ async function textToSpeech(text, outputPath, model, voice, speedFactor = 1.3) {
// Clean up temporary file
fs.unlinkSync(tempOutputPath);
// Get actual audio duration for accurate timing
const audioDuration = getAudioDuration(outputPath);
return audioDuration;
} catch (error) {