const fs = require('fs');
const path = require('path');
const { execSync } = require('child_process');
const dotenv = require('dotenv');
const { OpenAI } = require('openai');
const yargs = require('yargs/yargs');
const { hideBin } = require('yargs/helpers');

// Load environment variables
dotenv.config();

// Initialize OpenAI client
const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
});

// Default configuration options
const defaultConfig = {
  captureIntervalSeconds: 10,
  contextWindowSize: 5,
  defaultPrompt: "Describe this frame from a video in 1-2 sentences for someone who cannot see it. Focus on key visual elements. Avoid using terms like 'in this frame', simply describe the actual frame. Keep sentences short and concise, as this will be used to generate an audio track which is overlaid on the video.",
  changePrompt: "Describe what has changed between these frames in 1-2 sentences for someone who cannot see the video. Focus on significant visual changes only. Avoid talking about meta information such as 'in this frame', or 'the significant change is', and merely describe the actual change taking place. Only describe the changes relevant to the last frame. The previous frames are attached for you to build context and situational awareness. Keep it short and concise, as your text will be used to generate audio description tracks to be played with the video.",
  ttsModel: "tts-1-hd", // OpenAI TTS model
  ttsVoice: "alloy", // Voice option for TTS
  ttsSpeedFactor: 1.5, // Speed up audio by 50%
  outputDir: "/mnt/e/desc/output/",
  tempDir: "/mnt/e/desc/temp/",
  batchTimeMode: true, // Whether to use batch time mode
  batchWindowDuration: 15, // How many seconds each batch covers
  framesInBatch: 10, // How many frames to capture within each batch
  batchPrompt: "Describe the sequence of frames in this batch over time for someone who cannot see it. Focus on what happens, changes, or stands out visually during these seconds. Keep it to 1-3 concise sentences, avoiding words like 'in these frames'—just describe what's happening. Use context from the previous batch if relevant. Keep sentences short and concise. Avoid speculation or overly verbose or unnecessary sentences. Try not to use nested sentences and keep sentences short to help flow. This will be used for audio description and mixed back in with the video file later, so we need to maintain consistency and quick pacing. Avoid using phrases such as 'as evidenced by' or 'suggesting'. Only focus on describing the visual scene. Do not repeat information given in the previous prompt, and focus only on what has changed since that description."
};
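
// Illustrative --config file (JSON) overriding a few of these defaults:
//   {
//     "captureIntervalSeconds": 5,
//     "ttsVoice": "nova",
//     "batchTimeMode": false
//   }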

// Running totals. The *Cost fields accumulate raw token/character counts
// while processing and are converted to dollars by printStats().
let stats = {
  totalFrames: 0,
  totalBatches: 0,
  totalVisionInputCost: 0,
  totalVisionOutputCost: 0,
  totalTTSCost: 0,
  totalCost: 0
};
|
||
|
|
||
|
/**
|
||
|
* Parse command line arguments
|
||
|
*/
|
||
|
function parseCommandLineArgs() {
|
||
|
return yargs(hideBin(process.argv))
|
||
|
.usage('Usage: $0 <video_file_path> [options]')
|
||
|
.positional('video_file_path', {
|
||
|
describe: 'Path to the input video file',
|
||
|
type: 'string'
|
||
|
})
|
||
|
.option('captureIntervalSeconds', {
|
||
|
alias: 'i',
|
||
|
describe: 'Interval in seconds between frame captures',
|
||
|
type: 'number'
|
||
|
})
|
||
|
.option('contextWindowSize', {
|
||
|
alias: 'c',
|
||
|
describe: 'Number of frames to keep in context',
|
||
|
type: 'number'
|
||
|
})
|
||
|
.option('ttsModel', {
|
||
|
alias: 'm',
|
||
|
describe: 'OpenAI TTS model to use',
|
||
|
choices: ['tts-1', 'tts-1-hd'],
|
||
|
type: 'string'
|
||
|
})
|
||
|
.option('ttsVoice', {
|
||
|
alias: 'v',
|
||
|
describe: 'Voice to use for text-to-speech',
|
||
|
choices: ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'],
|
||
|
type: 'string'
|
||
|
})
|
||
|
.option('ttsSpeedFactor', {
|
||
|
alias: 's',
|
||
|
describe: 'Speed factor for the audio playback',
|
||
|
type: 'number'
|
||
|
})
|
||
|
.option('outputDir', {
|
||
|
alias: 'o',
|
||
|
describe: 'Directory for output files',
|
||
|
type: 'string'
|
||
|
})
|
||
|
.option('tempDir', {
|
||
|
alias: 't',
|
||
|
describe: 'Directory for temporary files',
|
||
|
type: 'string'
|
||
|
})
|
||
|
.option('batchTimeMode', {
|
||
|
alias: 'b',
|
||
|
describe: 'Use batch time mode for processing',
|
||
|
type: 'boolean'
|
||
|
})
|
||
|
.option('batchWindowDuration', {
|
||
|
describe: 'Duration in seconds for each batch window',
|
||
|
type: 'number'
|
||
|
})
|
||
|
.option('framesInBatch', {
|
||
|
describe: 'Number of frames to capture within each batch',
|
||
|
type: 'number'
|
||
|
})
|
||
|
.option('defaultPrompt', {
|
||
|
describe: 'Prompt for describing individual frames',
|
||
|
type: 'string'
|
||
|
})
|
||
|
.option('changePrompt', {
|
||
|
describe: 'Prompt for describing changes between frames',
|
||
|
type: 'string'
|
||
|
})
|
||
|
.option('batchPrompt', {
|
||
|
describe: 'Prompt for describing batches of frames',
|
||
|
type: 'string'
|
||
|
})
|
||
|
.option('estimate', {
|
||
|
alias: 'e',
|
||
|
describe: 'Only estimate the cost without generating the audio description',
|
||
|
type: 'boolean',
|
||
|
default: false
|
||
|
})
|
||
|
.option('config', {
|
||
|
alias: 'f',
|
||
|
describe: 'Path to JSON config file',
|
||
|
type: 'string'
|
||
|
})
|
||
|
.option('saveConfig', {
|
||
|
describe: 'Save current configuration to specified JSON file',
|
||
|
type: 'string'
|
||
|
})
|
||
|
.help()
|
||
|
.alias('help', 'h')
|
||
|
.example('$0 video.mp4', 'Process a video with default settings')
|
||
|
.example('$0 video.mp4 --captureIntervalSeconds 10 --ttsVoice nova', 'Process with custom interval and voice')
|
||
|
.example('$0 video.mp4 --estimate', 'Only estimate the processing cost')
|
||
|
.example('$0 video.mp4 --config myconfig.json', 'Use settings from a config file')
|
||
|
.example('$0 video.mp4 --saveConfig myconfig.json', 'Save current settings to a config file')
|
||
|
.argv;
|
||
|
}

/**
 * Main function to process a video file and generate audio descriptions
 * @param {string} videoFilePath - Path to the input video file
 * @param {object} options - Optional configuration overrides
 */
async function generateAudioDescription(videoFilePath, options = {}) {
  // Merge provided options with defaults
  const settings = { ...defaultConfig, ...options };

  // Ensure temporary and output directories exist
  if (!fs.existsSync(settings.tempDir)) {
    fs.mkdirSync(settings.tempDir, { recursive: true });
  }
  if (!fs.existsSync(settings.outputDir)) {
    fs.mkdirSync(settings.outputDir, { recursive: true });
  }

  // Get video duration
  const videoDuration = getVideoDuration(videoFilePath);
  stats.totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
  console.log(`Video duration: ${videoDuration} seconds`);

  // If batchTimeMode is enabled, use that approach instead
  if (settings.batchTimeMode) {
    return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings);
  }

  // Calculate the number of frames to capture
  const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
  console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);

  // Context window to store previous frames
  const frameContext = [];

  // Array to store audio segment information
  const audioSegments = [];

  // Process each frame
  for (let i = 0; i < totalFrames; i++) {
    const timePosition = i * settings.captureIntervalSeconds;
    const frameFilePath = path.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`);

    // Capture frame at current time position
    captureVideoFrame(videoFilePath, timePosition, frameFilePath);
    console.log(`Captured frame at ${timePosition} seconds`);

    // Add current frame to context
    const currentFrame = {
      index: i,
      path: frameFilePath,
      timePosition
    };

    frameContext.push(currentFrame);

    // Keep context window at specified size
    if (frameContext.length > settings.contextWindowSize) {
      frameContext.shift();
    }

    // Generate description
    let description;
    if (frameContext.length === 1) {
      // First frame - just describe what's in it
      description = await describeFrame(frameFilePath, settings.defaultPrompt);
    } else {
      // Compare with previous frame
      const previousFrame = frameContext[frameContext.length - 2];
      description = await describeFrameChange(previousFrame.path, frameFilePath, settings.changePrompt);
    }

    console.log(`Description: ${description}`);

    // Generate speech from description
    const audioFilePath = path.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`);
    await textToSpeech(description, audioFilePath, settings.ttsModel, settings.ttsVoice, settings.ttsSpeedFactor);

    // Store segment information
    audioSegments.push({
      audioFile: audioFilePath,
      startTime: timePosition,
      description
    });
  }

  // Combine audio segments into the final audio description track
  const outputAudioPath = path.join(settings.outputDir, `${path.basename(videoFilePath, path.extname(videoFilePath))}_description.mp3`);
  combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);

  // Clean up temporary files if desired
  // cleanupTempFiles(settings.tempDir);

  console.log(`\nAudio description generated: ${outputAudioPath}`);
  console.log(`To play alongside the video, use e.g.: mpv "${videoFilePath}" --audio-file="${outputAudioPath}"`);
  printStats(stats, settings);

  return {
    videoFile: videoFilePath,
    audioDescriptionFile: outputAudioPath
  };
}
|
||
|
|
||
|
/**
|
||
|
* Get the duration of a video file in seconds
|
||
|
* @param {string} videoFilePath - Path to the video file
|
||
|
* @returns {number} Duration in seconds
|
||
|
*/
|
||
|
function getVideoDuration(videoFilePath) {
|
||
|
const result = execSync(`ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "${videoFilePath}"`);
|
||
|
return parseFloat(result.toString());
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Capture a frame from a video at a specific time position
|
||
|
* @param {string} videoFilePath - Path to the video file
|
||
|
* @param {number} timePosition - Time position in seconds
|
||
|
* @param {string} outputPath - Output path for the captured frame
|
||
|
* @param {boolean} [lowQuality=false] - If true, save screenshot in 360p resolution
|
||
|
*/
|
||
|
function captureVideoFrame(videoFilePath, timePosition, outputPath, lowQuality = true) {
|
||
|
let command = `ffmpeg -v error -ss ${timePosition} -i "${videoFilePath}" -vframes 1 -q:v 2`;
|
||
|
|
||
|
// Add resolution scaling for low quality option
|
||
|
if (lowQuality) {
|
||
|
command += ' -vf scale=-1:360'; // Scale to 360p height while maintaining aspect ratio
|
||
|
}
|
||
|
|
||
|
command += ` "${outputPath}" -y`;
|
||
|
|
||
|
execSync(command);
|
||
|
}
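
// For example, at timePosition 120 with lowQuality=true this runs (illustrative):
//   ffmpeg -v error -ss 120 -i "input.mp4" -vframes 1 -q:v 2 -vf scale=-1:360 "frame.jpg" -y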
|
||
|
|
||
|
/**
|
||
|
* Describe a single video frame using AI
|
||
|
* @param {string} framePath - Path to the frame image
|
||
|
* @param {string} prompt - Prompt for the AI
|
||
|
* @returns {string} Description of the frame
|
||
|
*/
|
||
|
async function describeFrame(framePath, prompt) {
|
||
|
try {
|
||
|
const imageData = fs.readFileSync(framePath);
|
||
|
const base64Image = imageData.toString('base64');
|
||
|
|
||
|
const response = await openai.chat.completions.create({
|
||
|
model: "gpt-4o",
|
||
|
messages: [
|
||
|
{
|
||
|
role: "user",
|
||
|
content: [
|
||
|
{ type: "text", text: prompt },
|
||
|
{
|
||
|
type: "image_url",
|
||
|
image_url: {
|
||
|
url: `data:image/jpeg;base64,${base64Image}`
|
||
|
}
|
||
|
}
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
max_tokens: 300
|
||
|
});
|
||
|
stats.totalVisionInputCost += response.usage.prompt_tokens;
|
||
|
stats.totalVisionOutputCost += response.usage.completion_tokens;
|
||
|
stats.totalCost += response.usage.total_tokens;
|
||
|
|
||
|
|
||
|
return response.choices[0].message.content.trim();
|
||
|
} catch (error) {
|
||
|
console.error("Error describing frame:", error);
|
||
|
return "Unable to describe this frame.";
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Describe changes between two frames using AI
|
||
|
* @param {string} previousFramePath - Path to the previous frame
|
||
|
* @param {string} currentFramePath - Path to the current frame
|
||
|
* @param {string} prompt - Prompt for the AI
|
||
|
* @returns {string} Description of changes between frames
|
||
|
*/
|
||
|
async function describeFrameChange(previousFramePath, currentFramePath, prompt) {
|
||
|
try {
|
||
|
const previousImageData = fs.readFileSync(previousFramePath);
|
||
|
const currentImageData = fs.readFileSync(currentFramePath);
|
||
|
|
||
|
const previousBase64 = previousImageData.toString('base64');
|
||
|
const currentBase64 = currentImageData.toString('base64');
|
||
|
|
||
|
const response = await openai.chat.completions.create({
|
||
|
model: "gpt-4o",
|
||
|
messages: [
|
||
|
{
|
||
|
role: "user",
|
||
|
content: [
|
||
|
{ type: "text", text: prompt },
|
||
|
{
|
||
|
type: "image_url",
|
||
|
image_url: {
|
||
|
url: `data:image/jpeg;base64,${previousBase64}`
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
type: "image_url",
|
||
|
image_url: {
|
||
|
url: `data:image/jpeg;base64,${currentBase64}`
|
||
|
}
|
||
|
}
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
max_tokens: 300
|
||
|
});
|
||
|
stats.totalVisionInputCost += response.usage.prompt_tokens;
|
||
|
stats.totalVisionOutputCost += response.usage.completion_tokens;
|
||
|
stats.totalCost += response.usage.total_tokens;
|
||
|
|
||
|
return response.choices[0].message.content.trim();
|
||
|
} catch (error) {
|
||
|
console.error("Error describing frame change:", error);
|
||
|
return "Unable to describe changes between frames.";
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Generate audio description using the new "batch time" mode.
|
||
|
* @param {string} videoFilePath - Path to the input video file
|
||
|
* @param {number} videoDuration - Duration of the video in seconds
|
||
|
* @param {object} settings - The merged config and user options
|
||
|
*/
|
||
|
async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings) {
|
||
|
const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration);
|
||
|
console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`);
|
||
|
|
||
|
// We'll hold the last batch's frames or last batch's description for context
|
||
|
let lastBatchContext = [];
|
||
|
|
||
|
const audioSegments = [];
|
||
|
|
||
|
for (let batchIndex = 0; batchIndex < totalBatches; batchIndex++) {
|
||
|
const batchStart = batchIndex * settings.batchWindowDuration;
|
||
|
const batchEnd = batchStart + settings.batchWindowDuration;
|
||
|
if (batchEnd > videoDuration) break; // Safety check
|
||
|
|
||
|
// Capture frames for this batch
|
||
|
const framePaths = [];
|
||
|
for (let i = 0; i < settings.framesInBatch; i++) {
|
||
|
const t = batchStart + (i * settings.batchWindowDuration) / settings.framesInBatch;
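      // e.g. with the defaults (15 s window, 10 frames per batch) frames are
      // sampled at batchStart + 0, 1.5, 3.0, ..., 13.5 seconds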
      const frameFilePath = path.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`);
      captureVideoFrame(videoFilePath, t, frameFilePath);
      framePaths.push(frameFilePath);
    }

    // Use AI to describe this batch of frames, providing context where available
    const description = await describeBatchOfFrames(
      framePaths,
      lastBatchContext,
      settings.batchPrompt
    );

    console.log(`Batch #${batchIndex} description:\n${description}\n`);

    // Convert description to TTS
    const audioFilePath = path.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`);
    await textToSpeech(
      description,
      audioFilePath,
      settings.ttsModel,
      settings.ttsVoice,
      settings.ttsSpeedFactor
    );

    // Store segment info; the entire description is aligned to the start of the batch
    audioSegments.push({
      audioFile: audioFilePath,
      startTime: batchStart,
      description
    });

    // Update lastBatchContext so the next batch can keep track of what was previously seen
    lastBatchContext = {
      lastDescription: description,
      lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch
    };
  }

  // Combine all the audio segments into one track
  const outputAudioPath = path.join(
    settings.outputDir,
    `${path.basename(videoFilePath, path.extname(videoFilePath))}_description_batch.mp3`
  );
  combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);

  console.log(`\nBatch audio description generated: ${outputAudioPath}`);
  console.log(`To play alongside the video, use e.g.: mpv "${videoFilePath}" --audio-file="${outputAudioPath}"`);
  printStats(stats, settings);

  return {
    videoFile: videoFilePath,
    audioDescriptionFile: outputAudioPath
  };
}
|
||
|
|
||
|
/**
|
||
|
* Describe a batch of frames using AI, optionally providing context (last batch's data).
|
||
|
* @param {string[]} framePaths - Array of file paths for this batch's frames
|
||
|
* @param {object} lastBatchContext - Data from the previous batch (e.g., text or images)
|
||
|
* @param {string} batchPrompt - Prompt text for describing a batch
|
||
|
* @returns {string} AI-generated batch description
|
||
|
*/
|
||
|
async function describeBatchOfFrames(framePaths, lastBatchContext, batchPrompt) {
|
||
|
try {
|
||
|
// Convert images to base64
|
||
|
const imagesBase64 = framePaths.map(fp => {
|
||
|
const imageData = fs.readFileSync(fp);
|
||
|
return imageData.toString('base64');
|
||
|
});
|
||
|
|
||
|
// Build the messages array for the chat completion
|
||
|
// Start with a system or user message that includes the prompt
|
||
|
const messages = [
|
||
|
{
|
||
|
role: "user",
|
||
|
content: [
|
||
|
{ type: "text", text: batchPrompt }
|
||
|
]
|
||
|
}
|
||
|
];
|
||
|
|
||
|
// If we have some text context from the last batch, inject that as well
|
||
|
if (lastBatchContext && lastBatchContext.lastDescription) {
|
||
|
// E.g., add it as a "system" or "user" message depending on your style
|
||
|
messages.unshift({
|
||
|
role: "system",
|
||
|
content: `Previous batch summary: ${lastBatchContext.lastDescription}`
|
||
|
});
|
||
|
}
|
||
|
|
||
|
// Append each image in the new batch
|
||
|
imagesBase64.forEach(base64 => {
|
||
|
messages[messages.length - 1].content.push({
|
||
|
type: "image_url",
|
||
|
image_url: {
|
||
|
url: `data:image/jpeg;base64,${base64}`
|
||
|
}
|
||
|
});
|
||
|
});
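
    // Resulting shape (sketch):
    //   [ { role: "system", content: "Previous batch summary: ..." },  // only when context exists
    //     { role: "user", content: [ { type: "text", ... }, { type: "image_url", ... }, ... ] } ]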

    const response = await openai.chat.completions.create({
      model: "gpt-4o",
      messages,
      max_tokens: 300
    });

    // Accumulate raw token counts; printStats() converts them to dollars
    stats.totalVisionInputCost += response.usage.prompt_tokens;
    stats.totalVisionOutputCost += response.usage.completion_tokens;
    stats.totalCost += response.usage.total_tokens;

    return response.choices[0].message.content.trim();
  } catch (error) {
    console.error("Error describing batch of frames:", error);
    return "Unable to describe this batch of frames.";
  }
}
|
||
|
|
||
|
/**
|
||
|
* Convert text to speech using AI with speed adjustment
|
||
|
* @param {string} text - Text to convert to speech
|
||
|
* @param {string} outputPath - Output path for the audio file
|
||
|
* @param {string} model - TTS model to use
|
||
|
* @param {string} voice - Voice to use for TTS
|
||
|
* @param {number} speedFactor - Speed multiplier (1.0 = normal speed, 2.0 = double speed)
|
||
|
*/
|
||
|
async function textToSpeech(text, outputPath, model, voice, speedFactor = 1.3) {
|
||
|
try {
|
||
|
// Generate the initial TTS output
|
||
|
const tempOutputPath = outputPath.replace(/\.\w+$/, '_temp$&');
|
||
|
|
||
|
const mp3 = await openai.audio.speech.create({
|
||
|
model: model,
|
||
|
voice: voice,
|
||
|
input: text
|
||
|
});
|
||
|
|
||
|
stats.totalTTSCost += text.length;
|
||
|
|
||
|
const buffer = Buffer.from(await mp3.arrayBuffer());
|
||
|
fs.writeFileSync(tempOutputPath, buffer);
|
||
|
|
||
|
// Speed up the audio using FFmpeg
|
||
|
execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
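
    // Note: older FFmpeg builds cap atempo at 2.0 (valid range 0.5-2.0); for
    // larger factors, chain the filter instead, e.g. "atempo=2.0,atempo=1.25"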

    // Clean up temporary file
    fs.unlinkSync(tempOutputPath);

    // Get audio duration to make sure we have accurate timing
    const audioDuration = getAudioDuration(outputPath);
    return audioDuration;
  } catch (error) {
    console.error("Error generating speech:", error);
    // Create a silent audio file if TTS fails
    execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);
    return 1; // Return a default duration of 1 second
  }
}
|
||
|
|
||
|
/**
|
||
|
* Get the duration of an audio file in seconds
|
||
|
* @param {string} audioFilePath - Path to the audio file
|
||
|
* @returns {number} Duration in seconds
|
||
|
*/
|
||
|
function getAudioDuration(audioFilePath) {
|
||
|
const result = execSync(`ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "${audioFilePath}"`);
|
||
|
return parseFloat(result.toString());
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Combine audio segments into a single audio track using lossless intermediates
|
||
|
* @param {Array} segments - Array of audio segment information
|
||
|
* @param {string} outputPath - Output path for the combined audio
|
||
|
* @param {number} videoDuration - Duration of the video in seconds
|
||
|
* @param {object} settings - Configuration settings
|
||
|
*/
|
||
|
function combineAudioSegments(segments, outputPath, videoDuration, settings) {
|
||
|
console.log(`Combining ${segments.length} audio segments using lossless intermediates...`);
|
||
|
|
||
|
try {
|
||
|
// Create a silent base track with the full video duration (always WAV)
|
||
|
const silentBasePath = path.join(settings.tempDir, 'silent_base.wav');
|
||
|
execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=44100:cl=stereo -t ${videoDuration} -c:a pcm_s16le "${silentBasePath}" -y`);
|
||
|
|
||
|
// Sort segments by start time to process them in order
|
||
|
const sortedSegments = [...segments].sort((a, b) => a.startTime - b.startTime);
|
||
|
|
||
|
// Process one segment at a time, building up the audio file
|
||
|
let currentAudioPath = silentBasePath;
|
||
|
|
||
|
for (let i = 0; i < sortedSegments.length; i++) {
|
||
|
const segment = sortedSegments[i];
|
||
|
const outputFile = path.join(settings.tempDir, `segment_${i}_output.wav`);
|
||
|
|
||
|
// Convert the segment to a standard WAV format first to avoid compatibility issues
|
||
|
// and ensure we're always working with lossless audio
|
||
|
const standardizedSegment = path.join(settings.tempDir, `segment_${i}_std.wav`);
|
||
|
execSync(`ffmpeg -v error -i "${segment.audioFile}" -ar 44100 -ac 2 -c:a pcm_s16le "${standardizedSegment}" -y`);
|
||
|
|
||
|
// Calculate the position for this segment
|
||
|
const timestamp = segment.startTime.toFixed(3);
|
||
|
|
||
|
// Create a filter script for this segment
|
||
|
const filterPath = path.join(settings.tempDir, `filter_${i}.txt`);
|
||
|
|
||
|
// Use a filter that preserves the audio quality and positions correctly
|
||
|
const filterContent =
|
||
|
`[1:a]adelay=${Math.round(segment.startTime * 1000)}|${Math.round(segment.startTime * 1000)}[delayed];\n` +
|
||
|
`[0:a][delayed]amix=inputs=2:duration=first:dropout_transition=0:normalize=0[out]`;
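
      // For a segment starting at 12.5 s this produces (illustrative):
      //   [1:a]adelay=12500|12500[delayed];
      //   [0:a][delayed]amix=inputs=2:duration=first:dropout_transition=0:normalize=0[out]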

      fs.writeFileSync(filterPath, filterContent);

      // Execute FFmpeg with the filter script
      execSync(`ffmpeg -v error -i "${currentAudioPath}" -i "${standardizedSegment}" -filter_complex_script "${filterPath}" -map "[out]" -c:a pcm_s16le "${outputFile}" -y`);

      // Clean up the previous file if it's not the original
      if (currentAudioPath !== silentBasePath) {
        fs.unlinkSync(currentAudioPath);
      }

      // Clean up standardized segment and filter
      fs.unlinkSync(standardizedSegment);
      fs.unlinkSync(filterPath);

      // Update current audio path for the next iteration
      currentAudioPath = outputFile;

      console.log(`Added segment ${i + 1}/${sortedSegments.length} at position ${timestamp}s`);
    }

    // Only at the very end, convert to the requested output format
    if (path.extname(outputPath).toLowerCase() === '.mp3') {
      console.log(`Converting final lossless WAV to MP3: ${outputPath}`);
      execSync(`ffmpeg -v error -i "${currentAudioPath}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
    } else {
      fs.copyFileSync(currentAudioPath, outputPath);
    }

    console.log(`Audio description track created: ${outputPath}`);

    // Clean up the last temp file
    if (currentAudioPath !== silentBasePath) {
      fs.unlinkSync(currentAudioPath);
    }

    if (fs.existsSync(silentBasePath)) {
      fs.unlinkSync(silentBasePath);
    }

    return outputPath;

  } catch (error) {
    console.error("Error in lossless audio combination:", error.message);

    try {
      console.log("Trying alternative approach with single-step filter...");

      // Create a silent base track (always WAV)
      const silentBasePath = path.join(settings.tempDir, 'silent_base.wav');
      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=44100:cl=stereo -t ${videoDuration} -c:a pcm_s16le "${silentBasePath}" -y`);

      // Create a complex filter to overlay all audio files at their specific timestamps
      const filterScriptPath = path.join(settings.tempDir, 'overlay_filter.txt');
      let filterScript = '';

      // Sort segments by start time
      const sortedSegments = [...segments].sort((a, b) => a.startTime - b.startTime);

      // Standardize all segments to WAV first
      const standardizedSegments = [];
      for (let i = 0; i < sortedSegments.length; i++) {
        const segment = sortedSegments[i];
        const stdPath = path.join(settings.tempDir, `std_${i}.wav`);
        execSync(`ffmpeg -v error -i "${segment.audioFile}" -ar 44100 -ac 2 -c:a pcm_s16le "${stdPath}" -y`);
        standardizedSegments.push({
          path: stdPath,
          startTime: segment.startTime
        });
      }

      // Build the FFmpeg command with all standardized inputs
      let ffmpegCmd = `ffmpeg -v error -i "${silentBasePath}" `;

      // Add all standardized segments as inputs and create the filter script
      for (let i = 0; i < standardizedSegments.length; i++) {
        // Add as input
        ffmpegCmd += `-i "${standardizedSegments[i].path}" `;

        // The input index starts at 1 because 0 is the silent base
        const inputIndex = i + 1;
        const delay = Math.round(standardizedSegments[i].startTime * 1000);

        // Add this input to the filter script with the proper delay
        filterScript += `[${inputIndex}:a]adelay=${delay}|${delay}[a${i}];\n`;
      }

      // Complete the filter script to merge all streams
      filterScript += '[0:a]'; // Start with the base
      for (let i = 0; i < standardizedSegments.length; i++) {
        filterScript += `[a${i}]`;
      }
      // Use amix with normalize=0 to preserve volumes
      filterScript += `amix=inputs=${standardizedSegments.length + 1}:normalize=0:duration=first[aout]`;

      // Write the filter script
      fs.writeFileSync(filterScriptPath, filterScript);

      // Use an intermediate WAV for the output to maintain quality
      const intermediatePath = path.join(settings.tempDir, 'intermediate_output.wav');

      // Complete the FFmpeg command - always output to WAV first
      ffmpegCmd += `-filter_complex_script "${filterScriptPath}" -map "[aout]" -c:a pcm_s16le "${intermediatePath}" -y`;

      // Execute the command
      execSync(ffmpegCmd);

      // Convert to the requested format only at the end
      if (path.extname(outputPath).toLowerCase() === '.mp3') {
        console.log(`Converting final audio to MP3...`);
        execSync(`ffmpeg -v error -i "${intermediatePath}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
      } else {
        fs.copyFileSync(intermediatePath, outputPath);
      }

      console.log(`Audio description track created with alternative method: ${outputPath}`);

      // Clean up temp files
      if (fs.existsSync(filterScriptPath)) {
        fs.unlinkSync(filterScriptPath);
      }

      if (fs.existsSync(silentBasePath)) {
        fs.unlinkSync(silentBasePath);
      }

      if (fs.existsSync(intermediatePath)) {
        fs.unlinkSync(intermediatePath);
      }

      // Clean up standardized segments
      standardizedSegments.forEach(seg => {
        if (fs.existsSync(seg.path)) {
          fs.unlinkSync(seg.path);
        }
      });

      return outputPath;

    } catch (secondError) {
      console.error("Alternative approach failed:", secondError.message);

      // Last resort: generate a shell script with the proper FFmpeg commands
      const cmdFilePath = outputPath.replace(/\.\w+$/, '_ffmpeg_cmd.sh');
      let cmdContent = `#!/bin/bash\n\n# FFmpeg command to combine audio segments\n\n`;

      // Add commands to convert all segments to WAV first
      cmdContent += `# First convert all segments to standard WAV format\n`;
      for (let i = 0; i < segments.length; i++) {
        const segment = segments[i];
        const stdPath = `"${settings.tempDir}/std_${i}.wav"`;
        cmdContent += `ffmpeg -i "${segment.audioFile}" -ar 44100 -ac 2 -c:a pcm_s16le ${stdPath} -y\n`;
      }

      // Create silent base
      cmdContent += `\n# Create silent base track\n`;
      cmdContent += `ffmpeg -f lavfi -i anullsrc=r=44100:cl=stereo -t ${videoDuration} -c:a pcm_s16le "${settings.tempDir}/silent_base.wav" -y\n\n`;

      // Create filter file
      cmdContent += `# Create filter file\n`;
      cmdContent += `cat > "${settings.tempDir}/filter.txt" << EOL\n`;

      // Add delay filters for each segment
      for (let i = 0; i < segments.length; i++) {
        const segment = segments[i];
        const delay = Math.round(segment.startTime * 1000);
        cmdContent += `[${i + 1}:a]adelay=${delay}|${delay}[a${i}];\n`;
      }

      // Mix all streams
      cmdContent += `[0:a]`;
      for (let i = 0; i < segments.length; i++) {
        cmdContent += `[a${i}]`;
      }
      cmdContent += `amix=inputs=${segments.length + 1}:normalize=0:duration=first[aout]\nEOL\n\n`;

      // Final command
      cmdContent += `# Run final FFmpeg command\n`;
      cmdContent += `ffmpeg -i "${settings.tempDir}/silent_base.wav" `;

      // Add all segments as inputs
      for (let i = 0; i < segments.length; i++) {
        cmdContent += `-i "${settings.tempDir}/std_${i}.wav" `;
      }

      // Complete the command
      cmdContent += `-filter_complex_script "${settings.tempDir}/filter.txt" -map "[aout]" `;

      if (path.extname(outputPath).toLowerCase() === '.mp3') {
        cmdContent += `-c:a libmp3lame -q:a 2 `;
      } else {
        cmdContent += `-c:a pcm_s16le `;
      }

      cmdContent += `"${outputPath}" -y\n\n`;

      // Add cleanup
      cmdContent += `# Clean up temp files\n`;
      cmdContent += `rm "${settings.tempDir}/silent_base.wav" "${settings.tempDir}/filter.txt"\n`;
      for (let i = 0; i < segments.length; i++) {
        cmdContent += `rm "${settings.tempDir}/std_${i}.wav"\n`;
      }

      // Write the script and make it executable
      fs.writeFileSync(cmdFilePath, cmdContent);
      execSync(`chmod +x "${cmdFilePath}"`);

      console.log(`\nCreated executable script with proper FFmpeg commands: ${cmdFilePath}`);
      console.log(`Run this script to generate the audio file.`);

      return {
        commandFile: cmdFilePath
      };
    }
  }
}
|
||
|
|
||
|
/**
|
||
|
* Clean up temporary files
|
||
|
* @param {string} tempDir - Directory containing temporary files
|
||
|
*/
|
||
|
function cleanupTempFiles(tempDir) {
|
||
|
const files = fs.readdirSync(tempDir);
|
||
|
for (const file of files) {
|
||
|
fs.unlinkSync(path.join(tempDir, file));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Estimate the cost of generating audio descriptions for a video
|
||
|
* @param {string} videoFilePath - Path to the input video file
|
||
|
* @param {object} options - Optional configuration overrides
|
||
|
* @returns {object} Cost estimation breakdown
|
||
|
*/
|
||
|
/**
|
||
|
* Load configuration from a JSON file
|
||
|
* @param {string} filePath - Path to the configuration file
|
||
|
* @returns {object} Configuration object
|
||
|
*/
|
||
|
function loadConfigFromFile(filePath) {
|
||
|
try {
|
||
|
const configFile = fs.readFileSync(filePath, 'utf8');
|
||
|
const config = JSON.parse(configFile);
|
||
|
console.log(`Loaded configuration from ${filePath}`);
|
||
|
return config;
|
||
|
} catch (error) {
|
||
|
console.error(`Error loading config from ${filePath}:`, error.message);
|
||
|
process.exit(1);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Save configuration to a JSON file
|
||
|
* @param {string} filePath - Path to save the configuration file
|
||
|
* @param {object} config - Configuration object to save
|
||
|
*/
|
||
|
function saveConfigToFile(filePath, config) {
|
||
|
try {
|
||
|
// Filter out non-configuration properties
|
||
|
const configToSave = { ...config };
|
||
|
const keysToExclude = ['_', '$0', 'video_file_path', 'estimate', 'config', 'saveConfig', 'help', 'version', 'h'];
|
||
|
keysToExclude.forEach(key => delete configToSave[key]);
|
||
|
|
||
|
fs.writeFileSync(filePath, JSON.stringify(configToSave, null, 2), 'utf8');
|
||
|
console.log(`Configuration saved to ${filePath}`);
|
||
|
} catch (error) {
|
||
|
console.error(`Error saving config to ${filePath}:`, error.message);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
* Print out statistics
|
||
|
* @param {object} stats - Statistics object
|
||
|
* @param {object} settings - Configuration settings
|
||
|
*/
|
||
|
function printStats(stats, settings) {
|
||
|
// Pricing constants (as of March 2025, update as needed)
|
||
|
const pricing = {
|
||
|
// OpenAI pricing (per 1000 tokens)
|
||
|
gpt4o: {
|
||
|
input: 0.0025, // $0.0025 per 1K input tokens
|
||
|
output: 0.01 // $0.01 per 1K output tokens
|
||
|
},
|
||
|
// TTS pricing (per 1000 characters)
|
||
|
tts: {
|
||
|
'tts-1': 0.015, // $0.015 per 1K characters
|
||
|
'tts-1-hd': 0.030 // $0.030 per 1K characters
|
||
|
}
|
||
|
};
|
||
|
|
||
|
// Calculate prices. The stats object contains amount of tokens.
|
||
|
stats.totalVisionInputCost = stats.totalVisionInputCost * pricing.gpt4o.input / 1000;
|
||
|
stats.totalVisionOutputCost = stats.totalVisionOutputCost * pricing.gpt4o.output / 1000;
|
||
|
stats.totalTTSCost = stats.totalTTSCost * pricing.tts[settings.ttsModel] / 1000;
|
||
|
stats.totalCost = stats.totalVisionInputCost + stats.totalVisionOutputCost + stats.totalTTSCost;
|
||
|
|
||
|
// Print out the stats
|
||
|
console.log('\n=== STATISTICS ===');
|
||
|
console.log(`Total vision input cost: ${stats.totalVisionInputCost.toFixed(4)}`);
|
||
|
console.log(`Total vision output cost: ${stats.totalVisionOutputCost.toFixed(4)}`);
|
||
|
console.log(`Total TTS cost: ${stats.totalTTSCost.toFixed(4)}`);
|
||
|
console.log(`Total cost: ${stats.totalCost.toFixed(4)}`);
|
||
|
}
|
||
|
|
||
|
async function estimateCost(videoFilePath, options = {}) {
|
||
|
// Merge provided options with defaults
|
||
|
const settings = { ...defaultConfig, ...options };
|
||
|
|
||
|
// Get video duration
|
||
|
const videoDuration = getVideoDuration(videoFilePath);
|
||
|
console.log(`Video duration: ${videoDuration} seconds`);
|
||
|
|
||
|
// Calculate the number of frames to capture
|
||
|
const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
|
||
|
console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);
|
||
|
|
||
|
// Pricing constants (as of March 2025, update as needed)
|
||
|
const pricing = {
|
||
|
// OpenAI pricing (per 1000 tokens)
|
||
|
gpt4o: {
|
||
|
input: 0.01, // $0.01 per 1K input tokens
|
||
|
output: 0.03 // $0.03 per 1K output tokens
|
||
|
},
|
||
|
// TTS pricing (per 1000 characters)
|
||
|
tts: {
|
||
|
'tts-1': 0.015, // $0.015 per 1K characters
|
||
|
'tts-1-hd': 0.030 // $0.030 per 1K characters
|
||
|
}
|
||
|
};
|
||
|
|
||
|
// Check if the TTS model exists in our pricing table
|
||
|
if (!pricing.tts[settings.ttsModel]) {
|
||
|
console.warn(`Warning: TTS model "${settings.ttsModel}" not found in pricing table. Using tts-1-hd pricing.`);
|
||
|
settings.ttsModel = 'tts-1-hd'; // Fallback to a known model
|
||
|
}
|
||
|
|
||
|
// Estimated token counts
|
||
|
const estimatedVisionInputTokens = 1000; // Base tokens for the vision input (approx. for a single image)
|
||
|
const estimatedPromptTokens = 50; // Tokens for the prompt text
|
||
|
const estimatedOutputTokensPerFrame = 50; // Average tokens for description output
|
||
|
|
||
|
// Estimated character counts for TTS
|
||
|
const estimatedCharsPerDescription = 200; // Average characters per description
|
||
|
|
||
|
// Calculate estimated costs
|
||
|
const firstFrameCost = {
|
||
|
visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * pricing.gpt4o.input / 1000,
|
||
|
visionOutput: estimatedOutputTokensPerFrame * pricing.gpt4o.output / 1000,
|
||
|
tts: estimatedCharsPerDescription * pricing.tts[settings.ttsModel] / 1000
|
||
|
};
|
||
|
|
||
|
const subsequentFrameCost = {
|
||
|
// For subsequent frames, we need two images (previous + current)
|
||
|
visionInput: (estimatedVisionInputTokens * 2 + estimatedPromptTokens) * pricing.gpt4o.input / 1000,
|
||
|
visionOutput: estimatedOutputTokensPerFrame * pricing.gpt4o.output / 1000,
|
||
|
tts: estimatedCharsPerDescription * pricing.tts[settings.ttsModel] / 1000
|
||
|
};
|
||
|
|
||
|
// Calculate total costs
|
||
|
const totalVisionInputCost =
|
||
|
firstFrameCost.visionInput +
|
||
|
(totalFrames - 1) * subsequentFrameCost.visionInput;
|
||
|
|
||
|
const totalVisionOutputCost =
|
||
|
firstFrameCost.visionOutput +
|
||
|
(totalFrames - 1) * subsequentFrameCost.visionOutput;
|
||
|
|
||
|
const totalTTSCost =
|
||
|
firstFrameCost.tts +
|
||
|
(totalFrames - 1) * subsequentFrameCost.tts;
|
||
|
|
||
|
const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost;
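
  // Rough illustration under the assumptions above: a 600 s video at 10 s
  // intervals yields 60 frames, for roughly $0.31 vision input +
  // $0.03 vision output + $0.36 TTS (tts-1-hd), about $0.70 in total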

  // Create cost breakdown
  const costBreakdown = {
    videoInfo: {
      duration: videoDuration,
      totalFrames: totalFrames,
      captureInterval: settings.captureIntervalSeconds
    },
    apiCosts: {
      visionInput: totalVisionInputCost.toFixed(4),
      visionOutput: totalVisionOutputCost.toFixed(4),
      tts: totalTTSCost.toFixed(4),
      total: totalCost.toFixed(4)
    },
    estimates: {
      totalAPICallsToOpenAI: totalFrames * 2, // Vision + TTS for each frame
      estimatedProcessingTimeMinutes: (totalFrames * 3) / 60 // Rough estimate: 3 seconds per frame
    }
  };

  return costBreakdown;
}
|
||
|
|
||
|
// Main execution
|
||
|
if (require.main === module) {
|
||
|
// Parse command line arguments
|
||
|
const argv = parseCommandLineArgs();
|
||
|
|
||
|
// Start with default config
|
||
|
let config = { ...defaultConfig };
|
||
|
|
||
|
// If a config file is specified, load it
|
||
|
if (argv.config) {
|
||
|
const fileConfig = loadConfigFromFile(argv.config);
|
||
|
config = { ...config, ...fileConfig };
|
||
|
}
|
||
|
|
||
|
// Override with any command line arguments
|
||
|
Object.keys(argv).forEach(key => {
|
||
|
if (key !== '_' && key !== '$0' && key !== 'config' && key !== 'saveConfig' &&
|
||
|
key !== 'estimate' && key !== 'help' && key !== 'version' &&
|
||
|
argv[key] !== undefined) {
|
||
|
config[key] = argv[key];
|
||
|
}
|
||
|
});
|
||
|
|
||
|
// Save configuration if requested
|
||
|
if (argv.saveConfig) {
|
||
|
saveConfigToFile(argv.saveConfig, config);
|
||
|
}
|
||
|
|
||
|
// Check if a video file is provided
|
||
|
if (argv._.length < 1) {
|
||
|
console.error('Error: No video file specified');
|
||
|
console.log('Usage: node script.js <video_file_path> [options]');
|
||
|
console.log('Use --help for more information');
|
||
|
process.exit(1);
|
||
|
}
|
||
|
|
||
|
const videoFilePath = argv._[0];
|
||
|
|
||
|
// Run estimation or full processing
|
||
|
if (argv.estimate) {
|
||
|
estimateCost(videoFilePath, config)
|
||
|
.then(costBreakdown => {
|
||
|
console.log('\n=== COST ESTIMATION ===');
|
||
|
console.log(JSON.stringify(costBreakdown, null, 2));
|
||
|
console.log(`\nEstimated total cost: ${costBreakdown.apiCosts.total}`);
|
||
|
console.log(`Estimated processing time: ${costBreakdown.estimates.estimatedProcessingTimeMinutes.toFixed(1)} minutes`);
|
||
|
console.log('Note: Actual costs may vary based on image complexity and actual response lengths.');
|
||
|
})
|
||
|
.catch(err => {
|
||
|
console.error('Error estimating costs:', err);
|
||
|
});
|
||
|
} else {
|
||
|
// Run the full generator
|
||
|
generateAudioDescription(videoFilePath, config).catch(err => {
|
||
|
console.error('Error generating audio description:', err);
|
||
|
});
|
||
|
}
|
||
|
}

// Export functions for use as a module
module.exports = {
  generateAudioDescription,
  estimateCost,
  config: defaultConfig
};
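
// Example programmatic usage (illustrative; the require path is hypothetical):
//   const { generateAudioDescription, estimateCost } = require('./describe-video');
//   estimateCost('/path/to/video.mp4', { batchTimeMode: false })
//     .then(breakdown => console.log(breakdown.apiCosts.total));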