"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.generateAudioDescriptionFromOptions = generateAudioDescriptionFromOptions;
exports.generateAudioDescription = generateAudioDescription;
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
const config_1 = require("../config/config");
const stats_1 = require("../config/stats");
const visionProviderFactory_1 = require("../providers/vision/visionProviderFactory");
const ttsProviderFactory_1 = require("../providers/tts/ttsProviderFactory");
const mediaUtils_1 = require("./mediaUtils");

/**
 * Create a directory (and any missing parents) unless the path already exists.
 *
 * @param dirPath - Directory to create
 */
function ensureDirectory(dirPath) {
    if (!fs_1.default.existsSync(dirPath)) {
        fs_1.default.mkdirSync(dirPath, { recursive: true });
    }
}

/**
 * Reject a non-positive (or non-numeric) duration setting.
 *
 * The processing loops derive their iteration count from `videoDuration / value`:
 * a zero value yields `Infinity` iterations, and a negative or NaN value
 * silently yields none.
 *
 * @param value - Setting value to check
 * @param name - Setting name used in the error message
 * @throws {RangeError} When `value` is not greater than zero
 */
function assertPositive(value, name) {
    if (!(value > 0)) {
        throw new RangeError(`${name} must be a positive number (received ${value})`);
    }
}

/**
 * Add the usage figures of one vision call to the running stats.
 *
 * @param stats - Stats object, mutated in place
 * @param usage - Usage object returned by the vision provider
 */
function recordVisionUsage(stats, usage) {
    stats.totalVisionInputCost += usage.inputTokens;
    stats.totalVisionOutputCost += usage.outputTokens;
    stats.totalCost += usage.totalTokens;
}

/**
 * Convert one description to speech and build its segment record.
 * Also adds the TTS cost of the call to `stats.totalTTSCost`.
 *
 * @param ttsProvider - TTS provider instance
 * @param settings - Effective configuration
 * @param stats - Stats object, mutated in place
 * @param description - Text to speak
 * @param audioFilePath - Destination path handed to the TTS provider
 * @param startTime - Start time recorded on the segment
 * @returns Segment `{ audioFile, startTime, duration, description }`
 */
async function synthesizeSegment(ttsProvider, settings, stats, description, audioFilePath, startTime) {
    const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
        voice: settings.ttsVoice,
        model: settings.ttsProviders[settings.ttsProvider].model,
        speedFactor: settings.ttsSpeedFactor,
        instructions: settings.ttsInstructions
    });
    stats.totalTTSCost += ttsResult.cost;
    return {
        audioFile: audioFilePath,
        startTime,
        duration: ttsResult.duration,
        description
    };
}

/**
 * Compute where the next description may start.
 *
 * The next start is the end of the current audio plus a buffer; when that is
 * still earlier than the next scheduled position, the schedule wins and
 * `catchUpMessage` is logged.
 *
 * @param segmentStart - Start time of the segment that was just produced
 * @param audioDuration - Duration of that segment's audio
 * @param bufferTime - Gap to leave after the audio
 * @param nextIdealPosition - Scheduled start of the next item
 * @param catchUpMessage - Message logged when snapping back to the schedule
 * @returns Start time for the next item
 */
function advanceTimeline(segmentStart, audioDuration, bufferTime, nextIdealPosition, catchUpMessage) {
    const afterAudio = segmentStart + audioDuration + bufferTime;
    if (afterAudio < nextIdealPosition) {
        console.log(catchUpMessage);
        return nextIdealPosition;
    }
    return afterAudio;
}

/**
 * High-level API: Generate audio description for a video with just options.
 * This internally creates providers and stats so callers don't need to.
 *
 * @param videoFilePath - Path to the input video file
 * @param options - Optional configuration overrides, merged over the default config
 * @param processingOptions - Optional resume/progress options, passed through
 *   unchanged to generateAudioDescription
 * @returns Result of the operation
 */
async function generateAudioDescriptionFromOptions(videoFilePath, options = {}, processingOptions = {}) {
    const config = { ...(0, config_1.getDefaultConfig)(), ...options };
    ensureDirectory(config.tempDir);
    ensureDirectory(config.outputDir);
    const visionProvider = visionProviderFactory_1.VisionProviderFactory.getProvider(config);
    const ttsProvider = ttsProviderFactory_1.TTSProviderFactory.getProvider(config);
    const stats = (0, stats_1.createStats)();
    return generateAudioDescription(videoFilePath, visionProvider, ttsProvider, config, stats, processingOptions);
}

/**
 * Generate audio description for a video (low-level API requiring pre-initialized providers).
 *
 * In frame mode one frame is captured every `captureIntervalSeconds`. The first
 * frame handled by a call is described on its own; later frames are compared
 * with the previous one. When `settings.batchTimeMode` is truthy the work is
 * delegated to the batch implementation instead.
 *
 * @param videoFilePath - Path to the input video file
 * @param visionProvider - Vision provider instance
 * @param ttsProvider - TTS provider instance
 * @param options - Optional configuration overrides, merged over the default config
 * @param stats - Stats object for tracking (a fresh one is created when omitted)
 * @param processingOptions - Optional resume/progress options. Fields read here:
 *   `existingSegments`, `currentTimePosition`, `startIndex`, `onProgress`
 * @returns `{ videoFile, audioDescriptionFile, segments }`
 * @throws {RangeError} In frame mode, when `captureIntervalSeconds` is not positive
 */
async function generateAudioDescription(videoFilePath, visionProvider, ttsProvider, options = {}, stats = (0, stats_1.createStats)(), processingOptions = {}) {
    // Merge provided options with defaults so a partial (or omitted) options
    // object still yields usable directories, intervals and prompts.
    const settings = { ...(0, config_1.getDefaultConfig)(), ...options };

    // Ensure temporary and output directories exist
    ensureDirectory(settings.tempDir);
    ensureDirectory(settings.outputDir);

    // Get video duration
    const videoDuration = (0, mediaUtils_1.getVideoDuration)(videoFilePath);
    const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
    stats.totalFrames = totalFrames;
    console.log(`Video duration: ${videoDuration} seconds`);

    // If batchTimeMode is enabled, use the batch approach
    if (settings.batchTimeMode) {
        return generateAudioDescriptionBatch(videoFilePath, videoDuration, settings, visionProvider, ttsProvider, stats, processingOptions);
    }

    // Checked only in frame mode: batch mode does not step by this interval.
    assertPositive(settings.captureIntervalSeconds, 'captureIntervalSeconds');
    console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);

    // Context window of recently processed frames
    const frameContext = [];
    // Audio segment information - preloaded with existing segments if resuming
    const audioSegments = processingOptions.existingSegments
        ? [...processingOptions.existingSegments]
        : [];
    // Current time position (pushed later when a previous audio clip overruns)
    let currentTimePosition = processingOptions.currentTimePosition || 0;
    // Start from the given index if resuming
    const startIndex = processingOptions.startIndex || 0;
    // Maximum drift from the original schedule before a warning is logged
    const maxAllowableDrift = settings.captureIntervalSeconds * 2;

    for (let i = startIndex; i < totalFrames; i++) {
        // Ideal position from the original schedule vs. the adjusted position
        // that accounts for previous audio durations
        const idealTimePosition = i * settings.captureIntervalSeconds;
        const timePosition = currentTimePosition;
        const timelineDrift = timePosition - idealTimePosition;
        if (Math.abs(timelineDrift) > maxAllowableDrift) {
            console.warn(`WARNING: Timeline drift at frame ${i} is ${timelineDrift.toFixed(2)} seconds.`);
        }

        // The frame itself is always captured at the ideal time
        const frameFilePath = path_1.default.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`);
        (0, mediaUtils_1.captureVideoFrame)(videoFilePath, idealTimePosition, frameFilePath);
        console.log(`Captured frame at ${idealTimePosition} seconds (scheduled at ${timePosition.toFixed(2)} seconds)`);

        // Add current frame to context and keep the window at the configured size
        frameContext.push({ index: i, path: frameFilePath, timePosition });
        if (frameContext.length > settings.contextWindowSize) {
            frameContext.shift();
        }

        // Describe the frame: standalone when it is the only one in context,
        // otherwise as a change relative to the previous frame
        let result;
        if (frameContext.length === 1) {
            result = await visionProvider.describeImage(frameFilePath, settings.defaultPrompt);
        }
        else {
            const previousFrame = frameContext[frameContext.length - 2];
            result = await visionProvider.compareImages(previousFrame.path, frameFilePath, settings.changePrompt);
        }
        const description = result.description;
        recordVisionUsage(stats, result.usage);
        console.log(`Description: ${description}`);

        // Generate speech from description and store the segment
        const audioFilePath = path_1.default.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`);
        const segment = await synthesizeSegment(ttsProvider, settings, stats, description, audioFilePath, timePosition);
        console.log(`Audio duration: ${segment.duration} seconds`);
        audioSegments.push(segment);

        // Notify progress callback
        if (processingOptions.onProgress) {
            processingOptions.onProgress({
                type: 'frame',
                index: i,
                total: totalFrames,
                segment
            });
        }

        // Leave a small buffer (0.25 sec) between descriptions to prevent hard
        // cuts, and snap back to the schedule when the audio finished early
        currentTimePosition = advanceTimeline(timePosition, segment.duration, 0.25, (i + 1) * settings.captureIntervalSeconds, `Audio finished before next scheduled frame. Catching up with timeline.`);
    }

    // Combine audio segments into final audio description track
    const outputAudioPath = path_1.default.join(settings.outputDir, `${path_1.default.basename(videoFilePath, path_1.default.extname(videoFilePath))}_description.mp3`);
    (0, mediaUtils_1.combineAudioSegments)(audioSegments, outputAudioPath, videoDuration, settings);

    // Note: temporary frame/audio files in settings.tempDir are not removed here.
    console.log(`\nAudio description generated: ${outputAudioPath}`);
    console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
    (0, stats_1.printStats)(stats, settings);
    return {
        videoFile: videoFilePath,
        audioDescriptionFile: outputAudioPath,
        segments: audioSegments
    };
}

/**
 * Generate audio description using the "batch time" mode with overlap prevention.
 *
 * The video is cut into windows of `settings.batchWindowDuration` seconds;
 * `settings.framesInBatch` evenly spaced frames are captured per window and
 * described together. A trailing window that would extend past the end of the
 * video is not processed.
 *
 * @param videoFilePath - Path to the input video file
 * @param videoDuration - Duration of the video in seconds
 * @param settings - The merged config and user options
 * @param visionProvider - The vision provider instance
 * @param ttsProvider - The TTS provider instance
 * @param stats - Stats object for tracking
 * @param processingOptions - Optional resume/progress options. Fields read here:
 *   `lastContext`, `existingSegments`, `currentTimePosition`, `startIndex`, `onProgress`
 * @returns `{ videoFile, audioDescriptionFile, segments }`
 * @throws {RangeError} When `batchWindowDuration` is not positive
 */
async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings, visionProvider, ttsProvider, stats, processingOptions = {}) {
    assertPositive(settings.batchWindowDuration, 'batchWindowDuration');
    const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration);
    console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`);

    // Context handed to the vision provider: the previous batch's description
    // and last frames (or the caller-supplied context when resuming)
    let lastBatchContext = processingOptions.lastContext || {};
    // Preload with existing segments if resuming
    const audioSegments = processingOptions.existingSegments
        ? [...processingOptions.existingSegments]
        : [];
    // Current time position (pushed later when a previous audio clip overruns)
    let currentTimePosition = processingOptions.currentTimePosition || 0;
    // Start from the given index if resuming
    const startBatchIndex = processingOptions.startIndex || 0;
    // Warn once drift exceeds 50% of the batch window
    const maxAllowableDrift = settings.batchWindowDuration * 0.5;

    for (let batchIndex = startBatchIndex; batchIndex < totalBatches; batchIndex++) {
        // Ideal timing from the configuration vs. the adjusted start that
        // accounts for previous audio durations
        const idealBatchStart = batchIndex * settings.batchWindowDuration;
        const batchStart = currentTimePosition;
        const timelineDrift = batchStart - idealBatchStart;
        if (Math.abs(timelineDrift) > maxAllowableDrift) {
            console.warn(`WARNING: Timeline drift at batch ${batchIndex} is ${timelineDrift.toFixed(2)} seconds.`);
        }

        const batchEnd = idealBatchStart + settings.batchWindowDuration;
        if (batchEnd > videoDuration) {
            break; // Safety check
        }
        console.log(`Processing batch #${batchIndex}: Original time window ${idealBatchStart}-${batchEnd} sec, scheduled at ${batchStart.toFixed(2)} sec`);

        // Capture frames for this batch - always at the ideal timing
        const framePaths = [];
        for (let i = 0; i < settings.framesInBatch; i++) {
            const t = idealBatchStart + (i * settings.batchWindowDuration) / settings.framesInBatch;
            const frameFilePath = path_1.default.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`);
            (0, mediaUtils_1.captureVideoFrame)(videoFilePath, t, frameFilePath);
            framePaths.push(frameFilePath);
        }

        // Describe this batch of frames, passing the previous batch's context
        const result = await visionProvider.describeBatch(framePaths, lastBatchContext, settings.batchPrompt);
        const description = result.description;
        recordVisionUsage(stats, result.usage);
        console.log(`Batch #${batchIndex} description:\n${description}\n`);

        // Convert description to TTS and store the segment with the adjusted start time
        const audioFilePath = path_1.default.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`);
        const segment = await synthesizeSegment(ttsProvider, settings, stats, description, audioFilePath, batchStart);
        console.log(`Batch #${batchIndex} audio duration: ${segment.duration} seconds`);
        audioSegments.push(segment);

        // Notify progress callback
        if (processingOptions.onProgress) {
            processingOptions.onProgress({
                type: 'batch',
                index: batchIndex,
                total: totalBatches,
                segment
            });
        }

        // Leave a small buffer (0.5 sec) between descriptions, and snap back
        // to the schedule when the audio finished early
        currentTimePosition = advanceTimeline(batchStart, segment.duration, 0.5, (batchIndex + 1) * settings.batchWindowDuration, `Batch audio finished before next scheduled batch. Catching up with timeline.`);

        // Update lastBatchContext so the next batch knows what was previously seen
        lastBatchContext = {
            lastDescription: description,
            lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch
        };
    }

    // Combine all the audio segments into one track
    const outputAudioPath = path_1.default.join(settings.outputDir, `${path_1.default.basename(videoFilePath, path_1.default.extname(videoFilePath))}_description_batch.mp3`);
    (0, mediaUtils_1.combineAudioSegments)(audioSegments, outputAudioPath, videoDuration, settings);
    console.log(`\nBatch audio description generated: ${outputAudioPath}`);
    console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
    (0, stats_1.printStats)(stats, settings);
    return {
        videoFile: videoFilePath,
        audioDescriptionFile: outputAudioPath,
        segments: audioSegments
    };
}
//# sourceMappingURL=processor.js.map