295 lines
15 KiB
JavaScript
295 lines
15 KiB
JavaScript
|
|
"use strict";
|
||
|
|
// Interop helper for default imports. Reuses an already-installed
// `this.__importDefault` when one is present; otherwise falls back to the
// function below, which returns modules flagged with `__esModule` untouched and
// wraps anything else as `{ default: mod }` so `.default` is always readable.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
|
||
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
||
|
|
exports.generateAudioDescriptionFromOptions = generateAudioDescriptionFromOptions;
|
||
|
|
exports.generateAudioDescription = generateAudioDescription;
|
||
|
|
const fs_1 = __importDefault(require("fs"));
|
||
|
|
const path_1 = __importDefault(require("path"));
|
||
|
|
const config_1 = require("../config/config");
|
||
|
|
const stats_1 = require("../config/stats");
|
||
|
|
const visionProviderFactory_1 = require("../providers/vision/visionProviderFactory");
|
||
|
|
const ttsProviderFactory_1 = require("../providers/tts/ttsProviderFactory");
|
||
|
|
const mediaUtils_1 = require("./mediaUtils");
|
||
|
|
/**
 * Convenience entry point: builds the configuration, providers and stats
 * object itself, then delegates to generateAudioDescription.
 *
 * @param videoFilePath - Path to the input video file
 * @param options - Configuration overrides, shallow-merged over getDefaultConfig()
 * @param processingOptions - Passed through unchanged to generateAudioDescription
 * @returns Whatever generateAudioDescription resolves to
 */
async function generateAudioDescriptionFromOptions(videoFilePath, options = {}, processingOptions = {}) {
    const defaults = (0, config_1.getDefaultConfig)();
    const config = { ...defaults, ...options };
    // Create the working directories (temp first, then output) when absent.
    for (const directory of [config.tempDir, config.outputDir]) {
        if (fs_1.default.existsSync(directory)) {
            continue;
        }
        fs_1.default.mkdirSync(directory, { recursive: true });
    }
    // Both providers are selected by their factories from the merged config.
    const visionProvider = visionProviderFactory_1.VisionProviderFactory.getProvider(config);
    const ttsProvider = ttsProviderFactory_1.TTSProviderFactory.getProvider(config);
    const stats = (0, stats_1.createStats)();
    return generateAudioDescription(videoFilePath, visionProvider, ttsProvider, config, stats, processingOptions);
}
|
||
|
|
/**
 * Generate audio description for a video (low-level API requiring pre-initialized providers).
 *
 * Frame mode (the default): one frame is captured per `captureIntervalSeconds`,
 * described by the vision provider (the first frame in the context window on
 * its own, later frames by comparison with the previous one), converted to
 * speech, and placed on a timeline so that consecutive clips never overlap.
 * When `settings.batchTimeMode` is truthy the work is delegated to
 * generateAudioDescriptionBatch instead.
 *
 * @param videoFilePath - Path to the input video file
 * @param visionProvider - Vision provider instance; `describeImage` and
 *   `compareImages` are awaited and must resolve to `{ description, usage }`
 * @param ttsProvider - TTS provider instance; `textToSpeech` is awaited and
 *   must resolve to `{ duration, cost }`
 * @param options - Optional configuration overrides, shallow-merged over getDefaultConfig()
 * @param stats - Stats object for tracking; mutated in place
 * @param processingOptions - Optional resume/progress controls read here:
 *   `existingSegments`, `currentTimePosition`, `startIndex`, `onProgress`
 * @returns `{ videoFile, audioDescriptionFile, segments }`
 * @throws {RangeError} In frame mode, when the video duration and
 *   `captureIntervalSeconds` do not yield a finite frame count (e.g. an
 *   interval of 0 or a missing interval)
 */
async function generateAudioDescription(videoFilePath, visionProvider, ttsProvider, options = {}, stats, processingOptions = {}) {
    // Merge provided options with defaults so a partial (or omitted) options
    // object still yields usable directories, prompts and TTS settings.
    const settings = { ...(0, config_1.getDefaultConfig)(), ...options };
    // Ensure temporary and output directories exist
    for (const directory of [settings.tempDir, settings.outputDir]) {
        if (!fs_1.default.existsSync(directory)) {
            fs_1.default.mkdirSync(directory, { recursive: true });
        }
    }
    // Get video duration
    const videoDuration = (0, mediaUtils_1.getVideoDuration)(videoFilePath);
    // Number of frames to capture; a trailing partial interval is not captured.
    const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
    stats.totalFrames = totalFrames;
    console.log(`Video duration: ${videoDuration} seconds`);
    // If batchTimeMode is enabled, use the new approach
    if (settings.batchTimeMode) {
        return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings, visionProvider, ttsProvider, stats, processingOptions);
    }
    // A zero interval would give an unbounded loop of provider calls and a
    // missing one would silently produce an empty track, so fail fast instead.
    if (!Number.isFinite(totalFrames)) {
        throw new RangeError(`Cannot derive a frame count from video duration ${videoDuration} and captureIntervalSeconds ${settings.captureIntervalSeconds}`);
    }
    console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);
    // Context window to store previous frames
    const frameContext = [];
    // Array to store audio segment information - preload with existing segments if resuming
    const audioSegments = processingOptions.existingSegments
        ? [...processingOptions.existingSegments]
        : [];
    // Track our current time position (will be adjusted for audio overlap)
    let currentTimePosition = processingOptions.currentTimePosition || 0;
    // Start from given index if resuming
    const startIndex = processingOptions.startIndex || 0;
    // Drift from the original schedule beyond this many seconds triggers a warning
    const maxAllowableDrift = settings.captureIntervalSeconds * 2;
    // Small gap (in seconds) between descriptions to prevent hard cuts
    const bufferTime = 0.25;
    // Process each frame
    for (let i = startIndex; i < totalFrames; i++) {
        // Where this frame sits on the original, evenly spaced schedule
        const idealTimePosition = i * settings.captureIntervalSeconds;
        // Where its audio will actually start, after earlier clips pushed it back
        const timePosition = currentTimePosition;
        const timelineDrift = timePosition - idealTimePosition;
        if (Math.abs(timelineDrift) > maxAllowableDrift) {
            console.warn(`WARNING: Timeline drift at frame ${i} is ${timelineDrift.toFixed(2)} seconds.`);
        }
        const frameFilePath = path_1.default.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`);
        // The picture is always taken at the ideal time; only the audio placement drifts
        (0, mediaUtils_1.captureVideoFrame)(videoFilePath, idealTimePosition, frameFilePath);
        console.log(`Captured frame at ${idealTimePosition} seconds (scheduled at ${timePosition.toFixed(2)} seconds)`);
        // Add current frame to context
        const currentFrame = {
            index: i,
            path: frameFilePath,
            timePosition
        };
        frameContext.push(currentFrame);
        // Keep context window at specified size
        if (frameContext.length > settings.contextWindowSize) {
            frameContext.shift();
        }
        // Generate description
        let description;
        let usageStats;
        if (frameContext.length === 1) {
            // Only frame in the context window - just describe what's in it
            const result = await visionProvider.describeImage(frameFilePath, settings.defaultPrompt);
            description = result.description;
            usageStats = result.usage;
        }
        else {
            // Compare with previous frame
            const previousFrame = frameContext[frameContext.length - 2];
            const result = await visionProvider.compareImages(previousFrame.path, frameFilePath, settings.changePrompt);
            description = result.description;
            usageStats = result.usage;
        }
        // Update stats
        // NOTE(review): token counts are accumulated into fields named *Cost —
        // confirm the intended units against printStats.
        stats.totalVisionInputCost += usageStats.inputTokens;
        stats.totalVisionOutputCost += usageStats.outputTokens;
        stats.totalCost += usageStats.totalTokens;
        console.log(`Description: ${description}`);
        // Generate speech from description
        const audioFilePath = path_1.default.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`);
        const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
            voice: settings.ttsVoice,
            model: settings.ttsProviders[settings.ttsProvider].model,
            speedFactor: settings.ttsSpeedFactor,
            instructions: settings.ttsInstructions
        });
        const audioDuration = ttsResult.duration;
        stats.totalTTSCost += ttsResult.cost;
        console.log(`Audio duration: ${audioDuration} seconds`);
        // Store segment information
        const segment = {
            audioFile: audioFilePath,
            startTime: timePosition,
            duration: audioDuration,
            description
        };
        audioSegments.push(segment);
        // Notify progress callback
        if (processingOptions.onProgress) {
            processingOptions.onProgress({
                type: 'frame',
                index: i,
                total: totalFrames,
                segment
            });
        }
        // The next clip may start no earlier than the end of this one plus the gap
        currentTimePosition = timePosition + audioDuration + bufferTime;
        // If this clip ended before the next scheduled slot, snap back onto the
        // original schedule; otherwise the next clip starts late.
        const nextIdealPosition = (i + 1) * settings.captureIntervalSeconds;
        if (currentTimePosition < nextIdealPosition) {
            console.log(`Audio finished before next scheduled frame. Catching up with timeline.`);
            currentTimePosition = nextIdealPosition;
        }
    }
    // Combine audio segments into final audio description track
    const outputAudioPath = path_1.default.join(settings.outputDir, `${path_1.default.basename(videoFilePath, path_1.default.extname(videoFilePath))}_description.mp3`);
    (0, mediaUtils_1.combineAudioSegments)(audioSegments, outputAudioPath, videoDuration, settings);
    // Frame and audio files written to settings.tempDir are not removed here.
    console.log(`\nAudio description generated: ${outputAudioPath}`);
    console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
    (0, stats_1.printStats)(stats, settings);
    return {
        videoFile: videoFilePath,
        audioDescriptionFile: outputAudioPath,
        segments: audioSegments
    };
}
|
||
|
|
/**
 * Generate audio description using the "batch time" mode with overlap prevention.
 *
 * The video is cut into consecutive windows of `settings.batchWindowDuration`
 * seconds. For each window, `settings.framesInBatch` evenly spaced frames are
 * captured and described together by the vision provider, the description is
 * converted to speech, and the clip is placed on a timeline so that it never
 * overlaps the previous clip.
 *
 * @param videoFilePath - Path to the input video file
 * @param videoDuration - Duration of the video in seconds
 * @param settings - The merged config and user options
 * @param visionProvider - The vision provider instance; `describeBatch` is
 *   awaited and must resolve to `{ description, usage }`
 * @param ttsProvider - The TTS provider instance; `textToSpeech` is awaited
 *   and must resolve to `{ duration, cost }`
 * @param stats - Stats object for tracking; mutated in place
 * @param processingOptions - Optional resume/progress controls read here:
 *   `lastContext`, `existingSegments`, `currentTimePosition`, `startIndex`,
 *   `onProgress`
 * @returns `{ videoFile, audioDescriptionFile, segments }`
 * @throws {RangeError} When `batchWindowDuration` does not yield a finite
 *   batch count, or `framesInBatch` is not a positive finite number
 */
async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings, visionProvider, ttsProvider, stats, processingOptions = {}) {
    // A trailing partial window is not processed.
    const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration);
    // A zero window would loop forever (every window would end at 0, so the
    // in-loop safety check never fires); a missing one gives NaN. Fail fast.
    if (!Number.isFinite(totalBatches)) {
        throw new RangeError(`Cannot derive a batch count from video duration ${videoDuration} and batchWindowDuration ${settings.batchWindowDuration}`);
    }
    // Without at least one frame per batch the vision provider would be asked
    // to describe nothing.
    const framesPerBatch = Number(settings.framesInBatch);
    if (!Number.isFinite(framesPerBatch) || framesPerBatch <= 0) {
        throw new RangeError(`framesInBatch must be a positive number, got ${settings.framesInBatch}`);
    }
    console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`);
    // We'll hold the last batch's frames or last batch's description for context
    let lastBatchContext = processingOptions.lastContext || {};
    // Preload with existing segments if resuming
    const audioSegments = processingOptions.existingSegments
        ? [...processingOptions.existingSegments]
        : [];
    // Track our current time position (will be adjusted for audio overlap)
    let currentTimePosition = processingOptions.currentTimePosition || 0;
    // Start from given index if resuming
    const startBatchIndex = processingOptions.startIndex || 0;
    // Drift beyond 50% of the batch window triggers a warning
    const maxAllowableDrift = settings.batchWindowDuration * 0.5;
    // Gap (in seconds) left between consecutive descriptions
    const bufferTime = 0.5;
    for (let batchIndex = startBatchIndex; batchIndex < totalBatches; batchIndex++) {
        // Where this batch sits on the original, evenly spaced schedule
        const idealBatchStart = batchIndex * settings.batchWindowDuration;
        // Where its audio will actually start, after earlier clips pushed it back
        const batchStart = currentTimePosition;
        const timelineDrift = batchStart - idealBatchStart;
        if (Math.abs(timelineDrift) > maxAllowableDrift) {
            console.warn(`WARNING: Timeline drift at batch ${batchIndex} is ${timelineDrift.toFixed(2)} seconds.`);
        }
        const batchEnd = idealBatchStart + settings.batchWindowDuration;
        if (batchEnd > videoDuration) {
            break; // Safety check
        }
        console.log(`Processing batch #${batchIndex}: Original time window ${idealBatchStart}-${batchEnd} sec, scheduled at ${batchStart.toFixed(2)} sec`);
        // Capture frames for this batch - use the ideal timing for frame capture
        const framePaths = [];
        for (let i = 0; i < settings.framesInBatch; i++) {
            const t = idealBatchStart + (i * settings.batchWindowDuration) / settings.framesInBatch;
            const frameFilePath = path_1.default.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`);
            (0, mediaUtils_1.captureVideoFrame)(videoFilePath, t, frameFilePath);
            framePaths.push(frameFilePath);
        }
        // Use AI to describe this batch of frames, possibly providing some context
        const result = await visionProvider.describeBatch(framePaths, lastBatchContext, settings.batchPrompt);
        const description = result.description;
        const usageStats = result.usage;
        // Update stats
        // NOTE(review): token counts are accumulated into fields named *Cost —
        // confirm the intended units against printStats.
        stats.totalVisionInputCost += usageStats.inputTokens;
        stats.totalVisionOutputCost += usageStats.outputTokens;
        stats.totalCost += usageStats.totalTokens;
        console.log(`Batch #${batchIndex} description:\n${description}\n`);
        // Convert description to TTS
        const audioFilePath = path_1.default.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`);
        const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
            voice: settings.ttsVoice,
            model: settings.ttsProviders[settings.ttsProvider].model,
            speedFactor: settings.ttsSpeedFactor,
            instructions: settings.ttsInstructions
        });
        const audioDuration = ttsResult.duration;
        stats.totalTTSCost += ttsResult.cost;
        console.log(`Batch #${batchIndex} audio duration: ${audioDuration} seconds`);
        // Store segment info with the adjusted start time
        const segment = {
            audioFile: audioFilePath,
            startTime: batchStart,
            duration: audioDuration,
            description
        };
        audioSegments.push(segment);
        // Notify progress callback
        if (processingOptions.onProgress) {
            processingOptions.onProgress({
                type: 'batch',
                index: batchIndex,
                total: totalBatches,
                segment
            });
        }
        // The next clip may start no earlier than the end of this one plus the gap
        currentTimePosition = batchStart + audioDuration + bufferTime;
        // If this clip ended before the next scheduled window, snap back onto
        // the original schedule; otherwise the next clip starts late.
        const nextIdealPosition = (batchIndex + 1) * settings.batchWindowDuration;
        if (currentTimePosition < nextIdealPosition) {
            console.log(`Batch audio finished before next scheduled batch. Catching up with timeline.`);
            currentTimePosition = nextIdealPosition;
        }
        // Update lastBatchContext so the next batch can keep track of what's previously seen
        lastBatchContext = {
            lastDescription: description,
            lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch
        };
    }
    // Combine all the audio segments into one track
    const outputAudioPath = path_1.default.join(settings.outputDir, `${path_1.default.basename(videoFilePath, path_1.default.extname(videoFilePath))}_description_batch.mp3`);
    (0, mediaUtils_1.combineAudioSegments)(audioSegments, outputAudioPath, videoDuration, settings);
    console.log(`\nBatch audio description generated: ${outputAudioPath}`);
    console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
    (0, stats_1.printStats)(stats, settings);
    return {
        videoFile: videoFilePath,
        audioDescriptionFile: outputAudioPath,
        segments: audioSegments
    };
}
|
||
|
|
//# sourceMappingURL=processor.js.map
|