Schedule descriptions later if audio files are too long

parent a6cb8efc0c
commit 7e0b9cf220

index.js (465 changed lines)
@@ -139,106 +139,6 @@ function parseCommandLineArgs() {
     .argv;
 }
 
-/**
- * Main function to process a video file and generate audio descriptions
- * @param {string} videoFilePath - Path to the input video file
- * @param {object} options - Optional configuration overrides
- */
-async function generateAudioDescription(videoFilePath, options = {}) {
-  // Merge provided options with defaults
-  const settings = { ...defaultConfig, ...options };
-
-  // Ensure temporary and output directories exist
-  if (!fs.existsSync(settings.tempDir)) {
-    fs.mkdirSync(settings.tempDir, { recursive: true });
-  }
-  if (!fs.existsSync(settings.outputDir)) {
-    fs.mkdirSync(settings.outputDir, { recursive: true });
-  }
-
-  // Get video duration
-  const videoDuration = getVideoDuration(videoFilePath);
-  stats.totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
-  console.log(`Video duration: ${videoDuration} seconds`);
-
-  // If batchTimeMode is enabled, use the new approach
-  if (settings.batchTimeMode) {
-    return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings);
-  }
-
-  // Calculate the number of frames to capture
-  const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
-  console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);
-
-  // Context window to store previous frames
-  const frameContext = [];
-
-  // Array to store audio segment information
-  const audioSegments = [];
-
-  // Process each frame
-  for (let i = 0; i < totalFrames; i++) {
-    const timePosition = i * settings.captureIntervalSeconds;
-    const frameFilePath = path.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`);
-
-    // Capture frame at current time position
-    captureVideoFrame(videoFilePath, timePosition, frameFilePath);
-    console.log(`Captured frame at ${timePosition} seconds`);
-
-    // Add current frame to context
-    const currentFrame = {
-      index: i,
-      path: frameFilePath,
-      timePosition
-    };
-
-    frameContext.push(currentFrame);
-
-    // Keep context window at specified size
-    if (frameContext.length > settings.contextWindowSize) {
-      frameContext.shift();
-    }
-
-    // Generate description
-    let description;
-    if (frameContext.length === 1) {
-      // First frame - just describe what's in it
-      description = await describeFrame(frameFilePath, settings.defaultPrompt);
-    } else {
-      // Compare with previous frame
-      const previousFrame = frameContext[frameContext.length - 2];
-      description = await describeFrameChange(previousFrame.path, frameFilePath, settings.changePrompt);
-    }
-
-    console.log(`Description: ${description}`);
-
-    // Generate speech from description
-    const audioFilePath = path.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`);
-    await textToSpeech(description, audioFilePath, settings.ttsModel, settings.ttsVoice, settings.ttsSpeedFactor || 2);
-
-    // Store segment information
-    audioSegments.push({
-      audioFile: audioFilePath,
-      startTime: timePosition,
-      description
-    });
-  }
-
-  // Combine audio segments into final audio description track
-  const outputAudioPath = path.join(settings.outputDir, `${path.basename(videoFilePath, path.extname(videoFilePath))}_description.mp3`);
-  combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
-
-  // Clean up temporary files if desired
-  // cleanupTempFiles(settings.tempDir);
-
-  console.log(`\nAudio description generated: ${outputAudioPath}`);
-  console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
-  printStats(stats, settings);
-
-  return {
-    videoFile: videoFilePath,
-    audioDescriptionFile: outputAudioPath
-  };
-}
-
 /**
  * Get the duration of a video file in seconds
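The function removed above pins every description to `i * settings.captureIntervalSeconds`, so any narration that runs longer than the capture interval is still playing when the next segment starts. A minimal sketch of that failure mode, using hypothetical durations rather than anything from the repo:

```js
// Illustrative only: with a 10 s capture interval, a 14 s narration at t=10
// is still playing when the t=20 narration begins.
const captureIntervalSeconds = 10; // assumed interval
const durations = [6, 14, 9];      // hypothetical measured TTS durations

durations.forEach((duration, i) => {
  const start = i * captureIntervalSeconds; // the old fixed schedule
  const end = start + duration;
  const nextStart = (i + 1) * captureIntervalSeconds;
  if (end > nextStart) {
    console.log(`segment ${i} (${start}-${end}s) overlaps the next one by ${end - nextStart}s`);
  }
});
```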
@@ -361,85 +261,6 @@ async function describeFrameChange(previousFramePath, currentFramePath, prompt) {
   }
 }
 
-/**
- * Generate audio description using the new "batch time" mode.
- * @param {string} videoFilePath - Path to the input video file
- * @param {number} videoDuration - Duration of the video in seconds
- * @param {object} settings - The merged config and user options
- */
-async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings) {
-  const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration);
-  console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`);
-
-  // We'll hold the last batch's frames or last batch's description for context
-  let lastBatchContext = [];
-
-  const audioSegments = [];
-
-  for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
-    const batchStart = batchIndex * settings.batchWindowDuration;
-    const batchEnd = batchStart + settings.batchWindowDuration;
-    if (batchEnd > videoDuration) break; // Safety check
-
-    // Capture frames for this batch
-    const framePaths = [];
-    for (let i = 0; i < settings.framesInBatch; i++) {
-      const t = batchStart + (i * settings.batchWindowDuration) / settings.framesInBatch;
-      const frameFilePath = path.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`);
-      captureVideoFrame(videoFilePath, t, frameFilePath);
-      framePaths.push(frameFilePath);
-    }
-
-    // Use AI to describe this batch of frames, possibly providing some context
-    let description = await describeBatchOfFrames(
-      framePaths,
-      lastBatchContext,
-      settings.batchPrompt
-    );
-
-    console.log(`Batch #${batchIndex} description:\n${description}\n`);
-
-    // Convert description to TTS
-    const audioFilePath = path.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`);
-    await textToSpeech(
-      description,
-      audioFilePath,
-      settings.ttsModel,
-      settings.ttsVoice,
-      settings.ttsSpeedFactor
-    );
-
-    // Store segment info. We'll align the entire description at the start of the batch
-    audioSegments.push({
-      audioFile: audioFilePath,
-      startTime: batchStart,
-      description
-    });
-
-    // Update lastBatchContext so the next batch can keep track of what's previously seen
-    lastBatchContext = {
-      lastDescription: description,
-      lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch
-    };
-  }
-
-  // Combine all the audio segments into one track
-  const outputAudioPath = path.join(
-    settings.outputDir,
-    `${path.basename(videoFilePath, path.extname(videoFilePath))}_description_batch.mp3`
-  );
-  combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
-
-  console.log(`\nBatch audio description generated: ${outputAudioPath}`);
-  console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
-  printStats(stats, settings);
-
-  return {
-    videoFile: videoFilePath,
-    audioDescriptionFile: outputAudioPath
-  };
-}
-
 /**
  * Describe a batch of frames using AI, optionally providing context (last batch's data).
  * @param {string[]} framePaths - Array of file paths for this batch's frames
@@ -493,7 +314,7 @@ async function describeBatchOfFrames(framePaths, lastBatchContext, batchPrompt) {
     stats.totalVisionInputCost += response.usage.prompt_tokens;
     stats.totalVisionOutputCost += response.usage.completion_tokens;
     stats.totalCost += response.usage.total_tokens;
 
     return response.choices[0].message.content.trim();
   } catch (error) {
     console.error("Error describing batch of frames:", error);
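`describeBatchOfFrames` itself is largely untouched by this commit: it receives the batch's frame paths plus `lastBatchContext` (the previous description and last two frame paths). Its request construction is outside the diff, but a plausible sketch against the OpenAI chat completions API could look like the following — the message layout here is an assumption, not the repo's actual code:

```js
const fs = require('fs');
const OpenAI = require('openai');

const openai = new OpenAI(); // reads OPENAI_API_KEY from the environment

// Hypothetical sketch: build one vision request from the batch's frames,
// prepending the previous batch's description as lightweight context.
async function describeBatchSketch(framePaths, lastBatchContext, prompt) {
  const content = [{ type: 'text', text: prompt }];

  if (lastBatchContext && lastBatchContext.lastDescription) {
    content.push({
      type: 'text',
      text: `Previous batch description: ${lastBatchContext.lastDescription}`
    });
  }

  for (const framePath of framePaths) {
    const b64 = fs.readFileSync(framePath).toString('base64');
    content.push({
      type: 'image_url',
      image_url: { url: `data:image/jpeg;base64,${b64}` }
    });
  }

  const response = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [{ role: 'user', content }]
  });
  return response.choices[0].message.content.trim();
}
```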
@@ -501,13 +322,265 @@ async function describeBatchOfFrames(framePaths, lastBatchContext, batchPrompt) {
   }
 }
 
+// Modified function to prevent audio overlap
+async function generateAudioDescription(videoFilePath, options = {}) {
+  // Merge provided options with defaults
+  const settings = { ...defaultConfig, ...options };
+
+  // Ensure temporary and output directories exist
+  if (!fs.existsSync(settings.tempDir)) {
+    fs.mkdirSync(settings.tempDir, { recursive: true });
+  }
+  if (!fs.existsSync(settings.outputDir)) {
+    fs.mkdirSync(settings.outputDir, { recursive: true });
+  }
+
+  // Get video duration
+  const videoDuration = getVideoDuration(videoFilePath);
+  stats.totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
+  console.log(`Video duration: ${videoDuration} seconds`);
+
+  // If batchTimeMode is enabled, use the new approach
+  if (settings.batchTimeMode) {
+    return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings);
+  }
+
+  // Calculate the number of frames to capture
+  const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
+  console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);
+
+  // Context window to store previous frames
+  const frameContext = [];
+
+  // Array to store audio segment information
+  const audioSegments = [];
+
+  // Track our current time position (will be adjusted for audio overlap)
+  let currentTimePosition = 0;
+
+  // Track drift from the original schedule
+  let timelineDrift = 0;
+  const maxAllowableDrift = settings.captureIntervalSeconds * 2; // Maximum drift before warning
+
+  // Process each frame
+  for (let i = 0; i < totalFrames; i++) {
+    // Calculate the ideal time position based on the original schedule
+    const idealTimePosition = i * settings.captureIntervalSeconds;
+
+    // Use the adjusted time position that accounts for previous audio durations
+    const timePosition = currentTimePosition;
+
+    // Calculate drift from the original schedule
+    timelineDrift = timePosition - idealTimePosition;
+
+    // Log if drift is becoming significant
+    if (Math.abs(timelineDrift) > maxAllowableDrift) {
+      console.warn(`WARNING: Timeline drift at frame ${i} is ${timelineDrift.toFixed(2)} seconds.`);
+    }
+
+    const frameFilePath = path.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`);
+
+    // Capture frame at current time position (use the ideal time to capture the frame)
+    captureVideoFrame(videoFilePath, idealTimePosition, frameFilePath);
+    console.log(`Captured frame at ${idealTimePosition} seconds (scheduled at ${timePosition.toFixed(2)} seconds)`);
+
+    // Add current frame to context
+    const currentFrame = {
+      index: i,
+      path: frameFilePath,
+      timePosition
+    };
+
+    frameContext.push(currentFrame);
+
+    // Keep context window at specified size
+    if (frameContext.length > settings.contextWindowSize) {
+      frameContext.shift();
+    }
+
+    // Generate description
+    let description;
+    if (frameContext.length === 1) {
+      // First frame - just describe what's in it
+      description = await describeFrame(frameFilePath, settings.defaultPrompt);
+    } else {
+      // Compare with previous frame
+      const previousFrame = frameContext[frameContext.length - 2];
+      description = await describeFrameChange(previousFrame.path, frameFilePath, settings.changePrompt);
+    }
+
+    console.log(`Description: ${description}`);
+
+    // Generate speech from description
+    const audioFilePath = path.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`);
+    const audioDuration = await textToSpeech(description, audioFilePath, settings.ttsModel, settings.ttsVoice, settings.ttsSpeedFactor || 2);
+
+    console.log(`Audio duration: ${audioDuration} seconds`);
+
+    // Store segment information
+    audioSegments.push({
+      audioFile: audioFilePath,
+      startTime: timePosition,
+      duration: audioDuration,
+      description
+    });
+
+    // Update the time position for the next iteration
+    // Add a small buffer (0.25 sec) between descriptions to prevent hard cuts
+    const bufferTime = 0.25;
+    currentTimePosition = timePosition + audioDuration + bufferTime;
+
+    // If we've fallen behind schedule, try to catch up (but don't skip content)
+    const nextIdealPosition = (i + 1) * settings.captureIntervalSeconds;
+    if (currentTimePosition < nextIdealPosition) {
+      console.log(`Audio finished before next scheduled frame. Catching up with timeline.`);
+      currentTimePosition = nextIdealPosition;
+      timelineDrift = 0; // Reset drift since we've caught up
+    }
+  }
+
+  // Combine audio segments into final audio description track
+  const outputAudioPath = path.join(settings.outputDir, `${path.basename(videoFilePath, path.extname(videoFilePath))}_description.mp3`);
+  combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
+
+  // Clean up temporary files if desired
+  // cleanupTempFiles(settings.tempDir);
+
+  console.log(`\nAudio description generated: ${outputAudioPath}`);
+  console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
+  printStats(stats, settings);
+
+  return {
+    videoFile: videoFilePath,
+    audioDescriptionFile: outputAudioPath
+  };
+}
+
 /**
- * Convert text to speech using AI with speed adjustment
+ * Generate audio description using the new "batch time" mode with overlap prevention.
+ * @param {string} videoFilePath - Path to the input video file
+ * @param {number} videoDuration - Duration of the video in seconds
+ * @param {object} settings - The merged config and user options
+ */
+async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings) {
+  const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration);
+  console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`);
+
+  // We'll hold the last batch's frames or last batch's description for context
+  let lastBatchContext = [];
+
+  const audioSegments = [];
+
+  // Track our current time position (will be adjusted for audio overlap)
+  let currentTimePosition = 0;
+
+  // Track drift from the original schedule
+  let timelineDrift = 0;
+  const maxAllowableDrift = settings.batchWindowDuration * 0.5; // Maximum drift of 50% of batch window
+
+  for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
+    // Calculate ideal batch timing based on configuration
+    const idealBatchStart = batchIndex * settings.batchWindowDuration;
+
+    // Use adjusted time position that accounts for previous audio durations
+    const batchStart = currentTimePosition;
+
+    // Calculate drift from the original schedule
+    timelineDrift = batchStart - idealBatchStart;
+
+    // Log if drift is becoming significant
+    if (Math.abs(timelineDrift) > maxAllowableDrift) {
+      console.warn(`WARNING: Timeline drift at batch ${batchIndex} is ${timelineDrift.toFixed(2)} seconds.`);
+    }
+
+    const batchEnd = idealBatchStart + settings.batchWindowDuration;
+    if (batchEnd > videoDuration) break; // Safety check
+
+    console.log(`Processing batch #${batchIndex}: Original time window ${idealBatchStart}-${batchEnd} sec, scheduled at ${batchStart.toFixed(2)} sec`);
+
+    // Capture frames for this batch - use the ideal timing for frame capture
+    const framePaths = [];
+    for (let i = 0; i < settings.framesInBatch; i++) {
+      const t = idealBatchStart + (i * settings.batchWindowDuration) / settings.framesInBatch;
+      const frameFilePath = path.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`);
+      captureVideoFrame(videoFilePath, t, frameFilePath);
+      framePaths.push(frameFilePath);
+    }
+
+    // Use AI to describe this batch of frames, possibly providing some context
+    let description = await describeBatchOfFrames(
+      framePaths,
+      lastBatchContext,
+      settings.batchPrompt
+    );
+
+    console.log(`Batch #${batchIndex} description:\n${description}\n`);
+
+    // Convert description to TTS
+    const audioFilePath = path.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`);
+    const audioDuration = await textToSpeech(
+      description,
+      audioFilePath,
+      settings.ttsModel,
+      settings.ttsVoice,
+      settings.ttsSpeedFactor
+    );
+
+    console.log(`Batch #${batchIndex} audio duration: ${audioDuration} seconds`);
+
+    // Store segment info with the adjusted start time
+    audioSegments.push({
+      audioFile: audioFilePath,
+      startTime: batchStart,
+      duration: audioDuration,
+      description
+    });
+
+    // Update the time position for the next iteration
+    // Add a small buffer (0.5 sec) between descriptions
+    const bufferTime = 0.5;
+    currentTimePosition = batchStart + audioDuration + bufferTime;
+
+    // If we've fallen behind schedule, try to catch up (but don't skip content)
+    const nextIdealPosition = (batchIndex + 1) * settings.batchWindowDuration;
+    if (currentTimePosition < nextIdealPosition) {
+      console.log(`Batch audio finished before next scheduled batch. Catching up with timeline.`);
+      currentTimePosition = nextIdealPosition;
+      timelineDrift = 0; // Reset drift since we've caught up
+    }
+
+    // Update lastBatchContext so the next batch can keep track of what's previously seen
+    lastBatchContext = {
+      lastDescription: description,
+      lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch
+    };
+  }
+
+  // Combine all the audio segments into one track
+  const outputAudioPath = path.join(
+    settings.outputDir,
+    `${path.basename(videoFilePath, path.extname(videoFilePath))}_description_batch.mp3`
+  );
+  combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);
+
+  console.log(`\nBatch audio description generated: ${outputAudioPath}`);
+  console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
+  printStats(stats, settings);
+
+  return {
+    videoFile: videoFilePath,
+    audioDescriptionFile: outputAudioPath
+  };
+}
+
+/**
+ * Convert text to speech using AI with speed adjustment, and return the actual duration
  * @param {string} text - Text to convert to speech
  * @param {string} outputPath - Output path for the audio file
  * @param {string} model - TTS model to use
  * @param {string} voice - Voice to use for TTS
  * @param {number} speedFactor - Speed multiplier (1.0 = normal speed, 2.0 = double speed)
+ * @returns {number} The actual duration of the generated audio in seconds
  */
 async function textToSpeech(text, outputPath, model, voice, speedFactor = 1.3) {
   try {
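The heart of the change is the scheduling arithmetic: start each narration where the previous one ended plus a small buffer, warn when that drifts too far from the capture schedule, and snap back to the ideal timeline whenever audio finishes early. The same logic, distilled into a standalone pure function for clarity (a sketch, not code from the commit):

```js
// Distilled scheduler: given per-segment audio durations, compute
// non-overlapping start times that track the ideal schedule when possible.
function scheduleSegments(audioDurations, intervalSeconds, bufferTime = 0.25) {
  const starts = [];
  let current = 0;
  audioDurations.forEach((duration, i) => {
    starts.push(current);
    current = current + duration + bufferTime;    // never overlap the previous clip
    const nextIdeal = (i + 1) * intervalSeconds;
    if (current < nextIdeal) current = nextIdeal; // catch back up when audio runs short
  });
  return starts;
}

// With a 10 s interval, the 14 s narration pushes segment 2 to 24.25 s
// instead of letting it overlap at 20 s.
console.log(scheduleSegments([6, 14, 9], 10)); // [ 0, 10, 24.25 ]
```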
@@ -531,7 +604,7 @@ async function textToSpeech(text, outputPath, model, voice, speedFactor = 1.3) {
     // Clean up temporary file
     fs.unlinkSync(tempOutputPath);
 
-    // Get audio duration to make sure we have accurate timing
+    // Get actual audio duration for accurate timing
     const audioDuration = getAudioDuration(outputPath);
     return audioDuration;
   } catch (error) {
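`textToSpeech` now returns the measured length of the generated file via `getAudioDuration`, whose implementation sits outside this diff. One plausible way to implement it, assuming ffprobe is available (the ffplay hint elsewhere in the file suggests ffmpeg tooling is expected on the PATH):

```js
const { execSync } = require('child_process');

// Hypothetical sketch of getAudioDuration: read the container duration
// in seconds from ffprobe and parse it as a float.
function getAudioDuration(audioFilePath) {
  const output = execSync(
    `ffprobe -v error -show_entries format=duration ` +
    `-of default=noprint_wrappers=1:nokey=1 "${audioFilePath}"`
  );
  return parseFloat(output.toString().trim());
}
```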
@@ -854,7 +927,7 @@ function saveConfigToFile(filePath, config) {
     const configToSave = { ...config };
     const keysToExclude = ['_', '$0', 'video_file_path', 'estimate', 'config', 'saveConfig', 'help', 'version', 'h'];
     keysToExclude.forEach(key => delete configToSave[key]);
 
     fs.writeFileSync(filePath, JSON.stringify(configToSave, null, 2), 'utf8');
     console.log(`Configuration saved to ${filePath}`);
   } catch (error) {
@@ -881,13 +954,13 @@ function printStats(stats, settings) {
       'tts-1-hd': 0.030 // $0.030 per 1K characters
     }
   };
 
   // Calculate prices. The stats object contains amount of tokens.
   stats.totalVisionInputCost = stats.totalVisionInputCost * pricing.gpt4o.input / 1000;
   stats.totalVisionOutputCost = stats.totalVisionOutputCost * pricing.gpt4o.output / 1000;
   stats.totalTTSCost = stats.totalTTSCost * pricing.tts[settings.ttsModel] / 1000;
   stats.totalCost = stats.totalVisionInputCost + stats.totalVisionOutputCost + stats.totalTTSCost;
 
   // Print out the stats
   console.log('\n=== STATISTICS ===');
   console.log(`Total vision input cost: ${stats.totalVisionInputCost.toFixed(4)}`);
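Until this point the `total*Cost` fields hold raw token and character counts; the lines above convert them to dollars in place. A quick worked example of the conversion (illustrative numbers; the $0.015 tts-1 rate is an assumption by analogy with the tts-1-hd line shown):

```js
// Illustrative only: 12,000 TTS characters at an assumed $0.015 per 1K characters.
const ttsCharacters = 12000;
const ratePer1K = 0.015;                          // assumed 'tts-1' rate
const ttsCost = ttsCharacters * ratePer1K / 1000; // same formula printStats applies
console.log(ttsCost.toFixed(4));                  // "0.1800"
```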
@@ -907,7 +980,7 @@ async function estimateCost(videoFilePath, options = {}) {
   // Calculate the number of frames to capture
   const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
   console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);
 
   // Pricing constants (as of March 2025, update as needed)
   const pricing = {
     // OpenAI pricing (per 1000 tokens)
@@ -991,30 +1064,30 @@ async function estimateCost(videoFilePath, options = {}) {
 if (require.main === module) {
   // Parse command line arguments
   const argv = parseCommandLineArgs();
 
   // Start with default config
   let config = { ...defaultConfig };
 
   // If a config file is specified, load it
   if (argv.config) {
     const fileConfig = loadConfigFromFile(argv.config);
     config = { ...config, ...fileConfig };
   }
 
   // Override with any command line arguments
   Object.keys(argv).forEach(key => {
     if (key !== '_' && key !== '$0' && key !== 'config' && key !== 'saveConfig' &&
         key !== 'estimate' && key !== 'help' && key !== 'version' &&
         argv[key] !== undefined) {
       config[key] = argv[key];
     }
   });
 
   // Save configuration if requested
   if (argv.saveConfig) {
     saveConfigToFile(argv.saveConfig, config);
   }
 
   // Check if a video file is provided
   if (argv._.length < 1) {
     console.error('Error: No video file specified');
@@ -1022,9 +1095,9 @@ if (require.main === module) {
   console.log('Use --help for more information');
   process.exit(1);
 }
 
 const videoFilePath = argv._[0];
 
 // Run estimation or full processing
 if (argv.estimate) {
   estimateCost(videoFilePath, config)
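Beyond the CLI path shown above, the two entry points can in principle be driven programmatically. A hypothetical usage sketch — it assumes index.js exports these functions, which this diff does not show:

```js
// Hypothetical: assumes module.exports = { generateAudioDescription, estimateCost }
// exists elsewhere in index.js; the diff does not show it.
const { generateAudioDescription, estimateCost } = require('./index');

(async () => {
  // Dry-run cost estimate first, then the full pipeline with the new batch scheduler.
  await estimateCost('demo.mp4', { captureIntervalSeconds: 10 });
  const result = await generateAudioDescription('demo.mp4', {
    batchTimeMode: true,
    ttsSpeedFactor: 1.3
  });
  console.log(result.audioDescriptionFile);
})();
```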