Schedule descriptions later if audio files are too long

other-providers
Talon 2025-03-11 20:58:49 +01:00
parent a6cb8efc0c
commit 7e0b9cf220
1 changed file with 269 additions and 196 deletions

index.js

@@ -139,106 +139,6 @@ function parseCommandLineArgs() {
.argv;
}
/**
* Main function to process a video file and generate audio descriptions
* @param {string} videoFilePath - Path to the input video file
* @param {object} options - Optional configuration overrides
*/
async function generateAudioDescription(videoFilePath, options = {}) {
// Merge provided options with defaults
const settings = { ...defaultConfig, ...options };
// Ensure temporary and output directories exist
if (!fs.existsSync(settings.tempDir)) {
fs.mkdirSync(settings.tempDir, { recursive: true });
}
if (!fs.existsSync(settings.outputDir)) {
fs.mkdirSync(settings.outputDir, { recursive: true });
}
// Get video duration
const videoDuration = getVideoDuration(videoFilePath);
stats.totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
console.log(`Video duration: ${videoDuration} seconds`);
// If batchTimeMode is enabled, use the new approach
if (settings.batchTimeMode) {
return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings);
}
// Calculate the number of frames to capture
const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);
// Context window to store previous frames
const frameContext = [];
// Array to store audio segment information
const audioSegments = [];
// Process each frame
for (let i = 0; i < totalFrames; i++) {
const timePosition = i * settings.captureIntervalSeconds;
const frameFilePath = path.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`);
// Capture frame at current time position
captureVideoFrame(videoFilePath, timePosition, frameFilePath);
console.log(`Captured frame at ${timePosition} seconds`);
// Add current frame to context
const currentFrame = {
index: i,
path: frameFilePath,
timePosition
};
frameContext.push(currentFrame);
// Keep context window at specified size
if (frameContext.length > settings.contextWindowSize) {
frameContext.shift();
}
// Generate description
let description;
if (frameContext.length === 1) {
// First frame - just describe what's in it
description = await describeFrame(frameFilePath, settings.defaultPrompt);
} else {
// Compare with previous frame
const previousFrame = frameContext[frameContext.length - 2];
description = await describeFrameChange(previousFrame.path, frameFilePath, settings.changePrompt);
}
console.log(`Description: ${description}`);
// Generate speech from description
const audioFilePath = path.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`);
await textToSpeech(description, audioFilePath, settings.ttsModel, settings.ttsVoice, settings.ttsSpeedFactor || 2);
// Store segment information
audioSegments.push({
audioFile: audioFilePath,
startTime: timePosition,
description
});
}
// Combine audio segments into final audio description track
const outputAudioPath = path.join(settings.outputDir, `${path.basename(videoFilePath, path.extname(videoFilePath))}_description.mp3`);
combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
// Clean up temporary files if desired
// cleanupTempFiles(settings.tempDir);
console.log(`\nAudio description generated: ${outputAudioPath}`);
console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
printStats(stats, settings);
return {
videoFile: videoFilePath,
audioDescriptionFile: outputAudioPath
};
}
/**
* Get the duration of a video file in seconds
@@ -361,85 +261,6 @@ async function describeFrameChange(previousFramePath, currentFramePath, prompt)
}
}
/**
* Generate audio description using the new "batch time" mode.
* @param {string} videoFilePath - Path to the input video file
* @param {number} videoDuration - Duration of the video in seconds
* @param {object} settings - The merged config and user options
*/
async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings) {
const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration);
console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`);
// Context carried over from the previous batch: starts empty, later holds { lastDescription, lastFramePaths }
let lastBatchContext = [];
const audioSegments = [];
for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
const batchStart = batchIndex * settings.batchWindowDuration;
const batchEnd = batchStart + settings.batchWindowDuration;
if (batchEnd > videoDuration) break; // Safety check
// Capture frames for this batch
const framePaths = [];
for (let i = 0; i < settings.framesInBatch; i++) {
const t = batchStart + (i * settings.batchWindowDuration) / settings.framesInBatch;
const frameFilePath = path.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`);
captureVideoFrame(videoFilePath, t, frameFilePath);
framePaths.push(frameFilePath);
}
// Use AI to describe this batch of frames, possibly providing some context
let description = await describeBatchOfFrames(
framePaths,
lastBatchContext,
settings.batchPrompt
);
console.log(`Batch #${batchIndex} description:\n${description}\n`);
// Convert description to TTS
const audioFilePath = path.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`);
await textToSpeech(
description,
audioFilePath,
settings.ttsModel,
settings.ttsVoice,
settings.ttsSpeedFactor
);
// Store segment info. We'll align the entire description at the start of the batch
audioSegments.push({
audioFile: audioFilePath,
startTime: batchStart,
description
});
// Update lastBatchContext so the next batch can keep track of what's previously seen
lastBatchContext = {
lastDescription: description,
lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch
};
}
// Combine all the audio segments into one track
const outputAudioPath = path.join(
settings.outputDir,
`${path.basename(videoFilePath, path.extname(videoFilePath))}_description_batch.mp3`
);
combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
console.log(`\nBatch audio description generated: ${outputAudioPath}`);
console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
printStats(stats, settings);
return {
videoFile: videoFilePath,
audioDescriptionFile: outputAudioPath
};
}
/**
* Describe a batch of frames using AI, optionally providing context (last batch's data).
* @param {string[]} framePaths - Array of file paths for this batch's frames
@@ -501,13 +322,265 @@ async function describeBatchOfFrames(framePaths, lastBatchContext, batchPrompt)
}
}
// Modified function to prevent audio overlap
async function generateAudioDescription(videoFilePath, options = {}) {
// Merge provided options with defaults
const settings = { ...defaultConfig, ...options };
// Ensure temporary and output directories exist
if (!fs.existsSync(settings.tempDir)) {
fs.mkdirSync(settings.tempDir, { recursive: true });
}
if (!fs.existsSync(settings.outputDir)) {
fs.mkdirSync(settings.outputDir, { recursive: true });
}
// Get video duration
const videoDuration = getVideoDuration(videoFilePath);
stats.totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
console.log(`Video duration: ${videoDuration} seconds`);
// If batchTimeMode is enabled, use the new approach
if (settings.batchTimeMode) {
return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings);
}
// Calculate the number of frames to capture
const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);
// Context window to store previous frames
const frameContext = [];
// Array to store audio segment information
const audioSegments = [];
// Track our current time position (will be adjusted for audio overlap)
let currentTimePosition = 0;
// Track drift from the original schedule
let timelineDrift = 0;
const maxAllowableDrift = settings.captureIntervalSeconds * 2; // Maximum drift before warning
// Process each frame
for (let i = 0; i < totalFrames; i++) {
// Calculate the ideal time position based on the original schedule
const idealTimePosition = i * settings.captureIntervalSeconds;
// Use the adjusted time position that accounts for previous audio durations
const timePosition = currentTimePosition;
// Calculate drift from the original schedule
timelineDrift = timePosition - idealTimePosition;
// Log if drift is becoming significant
if (Math.abs(timelineDrift) > maxAllowableDrift) {
console.warn(`WARNING: Timeline drift at frame ${i} is ${timelineDrift.toFixed(2)} seconds.`);
}
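// Worked example (illustrative numbers, not from the source): with
// captureIntervalSeconds = 10, frame 3 ideally lands at 30 s. If earlier
// narration pushed currentTimePosition to 33.5 s, timelineDrift = 3.5 s,
// well under the warning threshold of captureIntervalSeconds * 2 = 20 s.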
const frameFilePath = path.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`);
// Capture frame at current time position (use the ideal time to capture the frame)
captureVideoFrame(videoFilePath, idealTimePosition, frameFilePath);
console.log(`Captured frame at ${idealTimePosition} seconds (description scheduled at ${timePosition.toFixed(2)} seconds)`);
// Add current frame to context
const currentFrame = {
index: i,
path: frameFilePath,
timePosition
};
frameContext.push(currentFrame);
// Keep context window at specified size
if (frameContext.length > settings.contextWindowSize) {
frameContext.shift();
}
// Generate description
let description;
if (frameContext.length === 1) {
// First frame - just describe what's in it
description = await describeFrame(frameFilePath, settings.defaultPrompt);
} else {
// Compare with previous frame
const previousFrame = frameContext[frameContext.length - 2];
description = await describeFrameChange(previousFrame.path, frameFilePath, settings.changePrompt);
}
console.log(`Description: ${description}`);
// Generate speech from description
const audioFilePath = path.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`);
const audioDuration = await textToSpeech(description, audioFilePath, settings.ttsModel, settings.ttsVoice, settings.ttsSpeedFactor || 2);
console.log(`Audio duration: ${audioDuration} seconds`);
// Store segment information
audioSegments.push({
audioFile: audioFilePath,
startTime: timePosition,
duration: audioDuration,
description
});
// Update the time position for the next iteration
// Add a small buffer (0.25 sec) between descriptions to prevent hard cuts
const bufferTime = 0.25;
currentTimePosition = timePosition + audioDuration + bufferTime;
// If we've fallen behind schedule, try to catch up (but don't skip content)
const nextIdealPosition = (i + 1) * settings.captureIntervalSeconds;
if (currentTimePosition < nextIdealPosition) {
console.log(`Audio finished before next scheduled frame. Catching up with timeline.`);
currentTimePosition = nextIdealPosition;
timelineDrift = 0; // Reset drift since we've caught up
}
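// Worked example of the repositioning step (illustrative numbers): a segment
// scheduled at timePosition = 30 s whose narration runs 4.25 s ends at
// 30 + 4.25 + 0.25 = 34.5 s. With captureIntervalSeconds = 10, the next ideal
// slot is 40 s, so currentTimePosition snaps forward to 40 and drift resets;
// had the narration run past 40 s, the next description would simply start
// late rather than overlap.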
}
// Combine audio segments into final audio description track
const outputAudioPath = path.join(settings.outputDir, `${path.basename(videoFilePath, path.extname(videoFilePath))}_description.mp3`);
combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
// Clean up temporary files if desired
// cleanupTempFiles(settings.tempDir);
console.log(`\nAudio description generated: ${outputAudioPath}`);
console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
printStats(stats, settings);
return {
videoFile: videoFilePath,
audioDescriptionFile: outputAudioPath
};
}
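// Example invocation (a sketch for illustration; the file name is a
// placeholder, while the option names come from the settings used above):
//
//   generateAudioDescription('input.mp4', {
//     captureIntervalSeconds: 10, // describe roughly every 10 s of video
//     ttsSpeedFactor: 1.5         // faster speech leaves less overlap to absorb
//   })
//     .then(({ audioDescriptionFile }) => console.log(`Done: ${audioDescriptionFile}`))
//     .catch(console.error);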
/**
* Generate audio description using the new "batch time" mode with overlap prevention.
* @param {string} videoFilePath - Path to the input video file
* @param {number} videoDuration - Duration of the video in seconds
* @param {object} settings - The merged config and user options
*/
async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings) {
const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration);
console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`);
// Context carried over from the previous batch: starts empty, later holds { lastDescription, lastFramePaths }
let lastBatchContext = [];
const audioSegments = [];
// Track our current time position (will be adjusted for audio overlap)
let currentTimePosition = 0;
// Track drift from the original schedule
let timelineDrift = 0;
const maxAllowableDrift = settings.batchWindowDuration * 0.5; // Maximum drift of 50% of batch window
for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
// Calculate ideal batch timing based on configuration
const idealBatchStart = batchIndex * settings.batchWindowDuration;
// Use adjusted time position that accounts for previous audio durations
const batchStart = currentTimePosition;
// Calculate drift from the original schedule
timelineDrift = batchStart - idealBatchStart;
// Log if drift is becoming significant
if (Math.abs(timelineDrift) > maxAllowableDrift) {
console.warn(`WARNING: Timeline drift at batch ${batchIndex} is ${timelineDrift.toFixed(2)} seconds.`);
}
const batchEnd = idealBatchStart + settings.batchWindowDuration;
if (batchEnd > videoDuration) break; // Safety check
console.log(`Processing batch #${batchIndex}: Original time window ${idealBatchStart}-${batchEnd} sec, scheduled at ${batchStart.toFixed(2)} sec`);
// Capture frames for this batch - use the ideal timing for frame capture
const framePaths = [];
for (let i = 0; i < settings.framesInBatch; i++) {
const t = idealBatchStart + (i * settings.batchWindowDuration) / settings.framesInBatch;
const frameFilePath = path.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`);
captureVideoFrame(videoFilePath, t, frameFilePath);
framePaths.push(frameFilePath);
}
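// Frame spacing example (illustrative numbers): with batchWindowDuration = 30
// and framesInBatch = 6, frames are sampled every 30 / 6 = 5 s, i.e. at
// idealBatchStart + 0, 5, 10, 15, 20, and 25, evenly covering the window
// without touching its right edge.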
// Use AI to describe this batch of frames, possibly providing some context
let description = await describeBatchOfFrames(
framePaths,
lastBatchContext,
settings.batchPrompt
);
console.log(`Batch #${batchIndex} description:\n${description}\n`);
// Convert description to TTS
const audioFilePath = path.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`);
const audioDuration = await textToSpeech(
description,
audioFilePath,
settings.ttsModel,
settings.ttsVoice,
settings.ttsSpeedFactor
);
console.log(`Batch #${batchIndex} audio duration: ${audioDuration} seconds`);
// Store segment info with the adjusted start time
audioSegments.push({
audioFile: audioFilePath,
startTime: batchStart,
duration: audioDuration,
description
});
// Update the time position for the next iteration
// Add a small buffer (0.5 sec) between descriptions
const bufferTime = 0.5;
currentTimePosition = batchStart + audioDuration + bufferTime;
// If we've fallen behind schedule, try to catch up (but don't skip content)
const nextIdealPosition = (batchIndex + 1) * settings.batchWindowDuration;
if (currentTimePosition < nextIdealPosition) {
console.log(`Batch audio finished before next scheduled batch. Catching up with timeline.`);
currentTimePosition = nextIdealPosition;
timelineDrift = 0; // Reset drift since we've caught up
}
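// Worked example (illustrative numbers): with batchWindowDuration = 30, batch 2
// ideally starts at 60 s. If earlier narration pushed batchStart to 78 s,
// timelineDrift = 18 s, which exceeds maxAllowableDrift = 15 s (half the
// window) and triggers the warning; whenever a batch's audio ends before the
// next ideal start, the schedule snaps back and drift resets to 0.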
// Update lastBatchContext so the next batch can keep track of what's previously seen
lastBatchContext = {
lastDescription: description,
lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch
};
}
// Combine all the audio segments into one track
const outputAudioPath = path.join(
settings.outputDir,
`${path.basename(videoFilePath, path.extname(videoFilePath))}_description_batch.mp3`
);
combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
console.log(`\nBatch audio description generated: ${outputAudioPath}`);
console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
printStats(stats, settings);
return {
videoFile: videoFilePath,
audioDescriptionFile: outputAudioPath
};
}
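// Example invocation of batch mode (a sketch for illustration; the option
// names come from the settings used above, the file name is a placeholder):
//
//   generateAudioDescription('input.mp4', {
//     batchTimeMode: true,     // routes into generateAudioDescriptionBatch
//     batchWindowDuration: 30, // one narrated description per 30 s window
//     framesInBatch: 6         // frames sampled per window for the model
//   });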
/**
* Convert text to speech using AI with speed adjustment, and return the actual duration
* @param {string} text - Text to convert to speech
* @param {string} outputPath - Output path for the audio file
* @param {string} model - TTS model to use
* @param {string} voice - Voice to use for TTS
* @param {number} speedFactor - Speed multiplier (1.0 = normal speed, 2.0 = double speed)
* @returns {Promise<number>} The actual duration of the generated audio in seconds
*/
async function textToSpeech(text, outputPath, model, voice, speedFactor = 1.3) {
try {
@@ -531,7 +604,7 @@ async function textToSpeech(text, outputPath, model, voice, speedFactor = 1.3) {
// Clean up temporary file
fs.unlinkSync(tempOutputPath);
// Get actual audio duration for accurate timing
const audioDuration = getAudioDuration(outputPath);
return audioDuration;
} catch (error) {