From 7e0b9cf2202e1627313d7a77aeba48609f172a1c Mon Sep 17 00:00:00 2001 From: Talon Date: Tue, 11 Mar 2025 20:58:49 +0100 Subject: [PATCH] Schedule descriptions later if audio files are too long --- index.js | 465 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 269 insertions(+), 196 deletions(-) diff --git a/index.js b/index.js index f65a83d..8b6641f 100644 --- a/index.js +++ b/index.js @@ -139,106 +139,6 @@ function parseCommandLineArgs() { .argv; } -/** - * Main function to process a video file and generate audio descriptions - * @param {string} videoFilePath - Path to the input video file - * @param {object} options - Optional configuration overrides - */ -async function generateAudioDescription(videoFilePath, options = {}) { - // Merge provided options with defaults - const settings = { ...defaultConfig, ...options }; - - // Ensure temporary and output directories exist - if (!fs.existsSync(settings.tempDir)) { - fs.mkdirSync(settings.tempDir, { recursive: true }); - } - if (!fs.existsSync(settings.outputDir)) { - fs.mkdirSync(settings.outputDir, { recursive: true }); - } - - // Get video duration - const videoDuration = getVideoDuration(videoFilePath); - stats.totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds); - console.log(`Video duration: ${videoDuration} seconds`); - - // If batchTimeMode is enabled, use the new approach - if (settings.batchTimeMode) { - return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings); - } - // Calculate the number of frames to capture - const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds); - console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`); - - // Context window to store previous frames - const frameContext = []; - - // Array to store audio segment information - const audioSegments = []; - - // Process each frame - for (let i = 0; i < totalFrames; i++) { - const timePosition = i * settings.captureIntervalSeconds; - const frameFilePath = path.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`); - - // Capture frame at current time position - captureVideoFrame(videoFilePath, timePosition, frameFilePath); - console.log(`Captured frame at ${timePosition} seconds`); - - // Add current frame to context - const currentFrame = { - index: i, - path: frameFilePath, - timePosition - }; - - frameContext.push(currentFrame); - - // Keep context window at specified size - if (frameContext.length > settings.contextWindowSize) { - frameContext.shift(); - } - - // Generate description - let description; - if (frameContext.length === 1) { - // First frame - just describe what's in it - description = await describeFrame(frameFilePath, settings.defaultPrompt); - } else { - // Compare with previous frame - const previousFrame = frameContext[frameContext.length - 2]; - description = await describeFrameChange(previousFrame.path, frameFilePath, settings.changePrompt); - } - - console.log(`Description: ${description}`); - - // Generate speech from description - const audioFilePath = path.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`); - await textToSpeech(description, audioFilePath, settings.ttsModel, settings.ttsVoice, settings.ttsSpeedFactor || 2); - - // Store segment information - audioSegments.push({ - audioFile: audioFilePath, - startTime: timePosition, - description - }); - } - - // Combine audio segments into final audio description track - const outputAudioPath = path.join(settings.outputDir, `${path.basename(videoFilePath, path.extname(videoFilePath))}_description.mp3`); - combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings); - - // Clean up temporary files if desired - // cleanupTempFiles(settings.tempDir); - - console.log(`\nAudio description generated: ${outputAudioPath}`); - console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`); - printStats(stats, settings); - - return { - videoFile: videoFilePath, - audioDescriptionFile: outputAudioPath - }; -} /** * Get the duration of a video file in seconds @@ -361,85 +261,6 @@ async function describeFrameChange(previousFramePath, currentFramePath, prompt) } } -/** - * Generate audio description using the new "batch time" mode. - * @param {string} videoFilePath - Path to the input video file - * @param {number} videoDuration - Duration of the video in seconds - * @param {object} settings - The merged config and user options - */ -async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings) { - const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration); - console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`); - - // We'll hold the last batch's frames or last batch's description for context - let lastBatchContext = []; - - const audioSegments = []; - - for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) { - const batchStart = batchIndex * settings.batchWindowDuration; - const batchEnd = batchStart + settings.batchWindowDuration; - if (batchEnd > videoDuration) break; // Safety check - - // Capture frames for this batch - const framePaths = []; - for (let i = 0; i < settings.framesInBatch; i++) { - const t = batchStart + (i * settings.batchWindowDuration) / settings.framesInBatch; - const frameFilePath = path.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`); - captureVideoFrame(videoFilePath, t, frameFilePath); - framePaths.push(frameFilePath); - } - - // Use AI to describe this batch of frames, possibly providing some context - let description = await describeBatchOfFrames( - framePaths, - lastBatchContext, - settings.batchPrompt - ); - - console.log(`Batch #${batchIndex} description:\n${description}\n`); - - // Convert description to TTS - const audioFilePath = path.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`); - await textToSpeech( - description, - audioFilePath, - settings.ttsModel, - settings.ttsVoice, - settings.ttsSpeedFactor - ); - - // Store segment info. We'll align the entire description at the start of the batch - audioSegments.push({ - audioFile: audioFilePath, - startTime: batchStart, - description - }); - - // Update lastBatchContext so the next batch can keep track of what's previously seen - lastBatchContext = { - lastDescription: description, - lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch - }; - } - - // Combine all the audio segments into one track - const outputAudioPath = path.join( - settings.outputDir, - `${path.basename(videoFilePath, path.extname(videoFilePath))}_description_batch.mp3` - ); - combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings); - - console.log(`\nBatch audio description generated: ${outputAudioPath}`); - console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`); - printStats(stats, settings); - - return { - videoFile: videoFilePath, - audioDescriptionFile: outputAudioPath - }; -} - /** * Describe a batch of frames using AI, optionally providing context (last batch's data). * @param {string[]} framePaths - Array of file paths for this batch's frames @@ -493,7 +314,7 @@ async function describeBatchOfFrames(framePaths, lastBatchContext, batchPrompt) stats.totalVisionInputCost += response.usage.prompt_tokens; stats.totalVisionOutputCost += response.usage.completion_tokens; stats.totalCost += response.usage.total_tokens; - + return response.choices[0].message.content.trim(); } catch (error) { console.error("Error describing batch of frames:", error); @@ -501,13 +322,265 @@ async function describeBatchOfFrames(framePaths, lastBatchContext, batchPrompt) } } +// Modified function to prevent audio overlap +async function generateAudioDescription(videoFilePath, options = {}) { + // Merge provided options with defaults + const settings = { ...defaultConfig, ...options }; + + // Ensure temporary and output directories exist + if (!fs.existsSync(settings.tempDir)) { + fs.mkdirSync(settings.tempDir, { recursive: true }); + } + if (!fs.existsSync(settings.outputDir)) { + fs.mkdirSync(settings.outputDir, { recursive: true }); + } + + // Get video duration + const videoDuration = getVideoDuration(videoFilePath); + stats.totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds); + console.log(`Video duration: ${videoDuration} seconds`); + + // If batchTimeMode is enabled, use the new approach + if (settings.batchTimeMode) { + return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings); + } + + // Calculate the number of frames to capture + const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds); + console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`); + + // Context window to store previous frames + const frameContext = []; + + // Array to store audio segment information + const audioSegments = []; + + // Track our current time position (will be adjusted for audio overlap) + let currentTimePosition = 0; + + // Track drift from the original schedule + let timelineDrift = 0; + const maxAllowableDrift = settings.captureIntervalSeconds * 2; // Maximum drift before warning + + // Process each frame + for (let i = 0; i < totalFrames; i++) { + // Calculate the ideal time position based on the original schedule + const idealTimePosition = i * settings.captureIntervalSeconds; + + // Use the adjusted time position that accounts for previous audio durations + const timePosition = currentTimePosition; + + // Calculate drift from the original schedule + timelineDrift = timePosition - idealTimePosition; + + // Log if drift is becoming significant + if (Math.abs(timelineDrift) > maxAllowableDrift) { + console.warn(`WARNING: Timeline drift at frame ${i} is ${timelineDrift.toFixed(2)} seconds.`); + } + + const frameFilePath = path.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`); + + // Capture frame at current time position (use the ideal time to capture the frame) + captureVideoFrame(videoFilePath, idealTimePosition, frameFilePath); + console.log(`Captured frame at ${idealTimePosition} seconds (scheduled at ${timePosition.toFixed(2)} seconds)`); + + // Add current frame to context + const currentFrame = { + index: i, + path: frameFilePath, + timePosition + }; + + frameContext.push(currentFrame); + + // Keep context window at specified size + if (frameContext.length > settings.contextWindowSize) { + frameContext.shift(); + } + + // Generate description + let description; + if (frameContext.length === 1) { + // First frame - just describe what's in it + description = await describeFrame(frameFilePath, settings.defaultPrompt); + } else { + // Compare with previous frame + const previousFrame = frameContext[frameContext.length - 2]; + description = await describeFrameChange(previousFrame.path, frameFilePath, settings.changePrompt); + } + + console.log(`Description: ${description}`); + + // Generate speech from description + const audioFilePath = path.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`); + const audioDuration = await textToSpeech(description, audioFilePath, settings.ttsModel, settings.ttsVoice, settings.ttsSpeedFactor || 2); + + console.log(`Audio duration: ${audioDuration} seconds`); + + // Store segment information + audioSegments.push({ + audioFile: audioFilePath, + startTime: timePosition, + duration: audioDuration, + description + }); + + // Update the time position for the next iteration + // Add a small buffer (0.25 sec) between descriptions to prevent hard cuts + const bufferTime = 0.25; + currentTimePosition = timePosition + audioDuration + bufferTime; + + // If we've fallen behind schedule, try to catch up (but don't skip content) + const nextIdealPosition = (i + 1) * settings.captureIntervalSeconds; + if (currentTimePosition < nextIdealPosition) { + console.log(`Audio finished before next scheduled frame. Catching up with timeline.`); + currentTimePosition = nextIdealPosition; + timelineDrift = 0; // Reset drift since we've caught up + } + } + + // Combine audio segments into final audio description track + const outputAudioPath = path.join(settings.outputDir, `${path.basename(videoFilePath, path.extname(videoFilePath))}_description.mp3`); + combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings); + + // Clean up temporary files if desired + // cleanupTempFiles(settings.tempDir); + + console.log(`\nAudio description generated: ${outputAudioPath}`); + console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`); + printStats(stats, settings); + + return { + videoFile: videoFilePath, + audioDescriptionFile: outputAudioPath + }; +} + /** - * Convert text to speech using AI with speed adjustment + * Generate audio description using the new "batch time" mode with overlap prevention. + * @param {string} videoFilePath - Path to the input video file + * @param {number} videoDuration - Duration of the video in seconds + * @param {object} settings - The merged config and user options + */ +async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings) { + const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration); + console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`); + + // We'll hold the last batch's frames or last batch's description for context + let lastBatchContext = []; + + const audioSegments = []; + + // Track our current time position (will be adjusted for audio overlap) + let currentTimePosition = 0; + + // Track drift from the original schedule + let timelineDrift = 0; + const maxAllowableDrift = settings.batchWindowDuration * 0.5; // Maximum drift of 50% of batch window + + for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) { + // Calculate ideal batch timing based on configuration + const idealBatchStart = batchIndex * settings.batchWindowDuration; + + // Use adjusted time position that accounts for previous audio durations + const batchStart = currentTimePosition; + + // Calculate drift from the original schedule + timelineDrift = batchStart - idealBatchStart; + + // Log if drift is becoming significant + if (Math.abs(timelineDrift) > maxAllowableDrift) { + console.warn(`WARNING: Timeline drift at batch ${batchIndex} is ${timelineDrift.toFixed(2)} seconds.`); + } + + const batchEnd = idealBatchStart + settings.batchWindowDuration; + if (batchEnd > videoDuration) break; // Safety check + + console.log(`Processing batch #${batchIndex}: Original time window ${idealBatchStart}-${batchEnd} sec, scheduled at ${batchStart.toFixed(2)} sec`); + + // Capture frames for this batch - use the ideal timing for frame capture + const framePaths = []; + for (let i = 0; i < settings.framesInBatch; i++) { + const t = idealBatchStart + (i * settings.batchWindowDuration) / settings.framesInBatch; + const frameFilePath = path.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`); + captureVideoFrame(videoFilePath, t, frameFilePath); + framePaths.push(frameFilePath); + } + + // Use AI to describe this batch of frames, possibly providing some context + let description = await describeBatchOfFrames( + framePaths, + lastBatchContext, + settings.batchPrompt + ); + + console.log(`Batch #${batchIndex} description:\n${description}\n`); + + // Convert description to TTS + const audioFilePath = path.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`); + const audioDuration = await textToSpeech( + description, + audioFilePath, + settings.ttsModel, + settings.ttsVoice, + settings.ttsSpeedFactor + ); + + console.log(`Batch #${batchIndex} audio duration: ${audioDuration} seconds`); + + // Store segment info with the adjusted start time + audioSegments.push({ + audioFile: audioFilePath, + startTime: batchStart, + duration: audioDuration, + description + }); + + // Update the time position for the next iteration + // Add a small buffer (0.5 sec) between descriptions + const bufferTime = 0.5; + currentTimePosition = batchStart + audioDuration + bufferTime; + + // If we've fallen behind schedule, try to catch up (but don't skip content) + const nextIdealPosition = (batchIndex + 1) * settings.batchWindowDuration; + if (currentTimePosition < nextIdealPosition) { + console.log(`Batch audio finished before next scheduled batch. Catching up with timeline.`); + currentTimePosition = nextIdealPosition; + timelineDrift = 0; // Reset drift since we've caught up + } + + // Update lastBatchContext so the next batch can keep track of what's previously seen + lastBatchContext = { + lastDescription: description, + lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch + }; + } + + // Combine all the audio segments into one track + const outputAudioPath = path.join( + settings.outputDir, + `${path.basename(videoFilePath, path.extname(videoFilePath))}_description_batch.mp3` + ); + combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings); + + console.log(`\nBatch audio description generated: ${outputAudioPath}`); + console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`); + printStats(stats, settings); + + return { + videoFile: videoFilePath, + audioDescriptionFile: outputAudioPath + }; +} + +/** + * Convert text to speech using AI with speed adjustment, and return the actual duration * @param {string} text - Text to convert to speech * @param {string} outputPath - Output path for the audio file * @param {string} model - TTS model to use * @param {string} voice - Voice to use for TTS * @param {number} speedFactor - Speed multiplier (1.0 = normal speed, 2.0 = double speed) + * @returns {number} The actual duration of the generated audio in seconds */ async function textToSpeech(text, outputPath, model, voice, speedFactor = 1.3) { try { @@ -531,7 +604,7 @@ async function textToSpeech(text, outputPath, model, voice, speedFactor = 1.3) { // Clean up temporary file fs.unlinkSync(tempOutputPath); - // Get audio duration to make sure we have accurate timing + // Get actual audio duration for accurate timing const audioDuration = getAudioDuration(outputPath); return audioDuration; } catch (error) { @@ -854,7 +927,7 @@ function saveConfigToFile(filePath, config) { const configToSave = { ...config }; const keysToExclude = ['_', '$0', 'video_file_path', 'estimate', 'config', 'saveConfig', 'help', 'version', 'h']; keysToExclude.forEach(key => delete configToSave[key]); - + fs.writeFileSync(filePath, JSON.stringify(configToSave, null, 2), 'utf8'); console.log(`Configuration saved to ${filePath}`); } catch (error) { @@ -881,13 +954,13 @@ function printStats(stats, settings) { 'tts-1-hd': 0.030 // $0.030 per 1K characters } }; - + // Calculate prices. The stats object contains amount of tokens. stats.totalVisionInputCost = stats.totalVisionInputCost * pricing.gpt4o.input / 1000; stats.totalVisionOutputCost = stats.totalVisionOutputCost * pricing.gpt4o.output / 1000; stats.totalTTSCost = stats.totalTTSCost * pricing.tts[settings.ttsModel] / 1000; stats.totalCost = stats.totalVisionInputCost + stats.totalVisionOutputCost + stats.totalTTSCost; - + // Print out the stats console.log('\n=== STATISTICS ==='); console.log(`Total vision input cost: ${stats.totalVisionInputCost.toFixed(4)}`); @@ -907,7 +980,7 @@ async function estimateCost(videoFilePath, options = {}) { // Calculate the number of frames to capture const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds); console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`); - + // Pricing constants (as of March 2025, update as needed) const pricing = { // OpenAI pricing (per 1000 tokens) @@ -991,30 +1064,30 @@ async function estimateCost(videoFilePath, options = {}) { if (require.main === module) { // Parse command line arguments const argv = parseCommandLineArgs(); - + // Start with default config let config = { ...defaultConfig }; - + // If a config file is specified, load it if (argv.config) { const fileConfig = loadConfigFromFile(argv.config); config = { ...config, ...fileConfig }; } - + // Override with any command line arguments Object.keys(argv).forEach(key => { - if (key !== '_' && key !== '$0' && key !== 'config' && key !== 'saveConfig' && - key !== 'estimate' && key !== 'help' && key !== 'version' && - argv[key] !== undefined) { + if (key !== '_' && key !== '$0' && key !== 'config' && key !== 'saveConfig' && + key !== 'estimate' && key !== 'help' && key !== 'version' && + argv[key] !== undefined) { config[key] = argv[key]; } }); - + // Save configuration if requested if (argv.saveConfig) { saveConfigToFile(argv.saveConfig, config); } - + // Check if a video file is provided if (argv._.length < 1) { console.error('Error: No video file specified'); @@ -1022,9 +1095,9 @@ if (require.main === module) { console.log('Use --help for more information'); process.exit(1); } - + const videoFilePath = argv._[0]; - + // Run estimation or full processing if (argv.estimate) { estimateCost(videoFilePath, config)