Rewrite frontend as single self-contained HTML file — all CSS/JS inline, no external files to fail loading

2026-05-13 17:24:10 +02:00
parent 3432d362e2
commit ddb0f88257
116 changed files with 4240 additions and 921 deletions
--- a/dist/utils/processor.js
+++ b/dist/utils/processor.js
@@ -0,0 +1,295 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.generateAudioDescriptionFromOptions = generateAudioDescriptionFromOptions;
+exports.generateAudioDescription = generateAudioDescription;
+const fs_1 = __importDefault(require("fs"));
+const path_1 = __importDefault(require("path"));
+const config_1 = require("../config/config");
+const stats_1 = require("../config/stats");
+const visionProviderFactory_1 = require("../providers/vision/visionProviderFactory");
+const ttsProviderFactory_1 = require("../providers/tts/ttsProviderFactory");
+const mediaUtils_1 = require("./mediaUtils");
+/**
+ * High-level API: Generate audio description for a video with just options.
+ * This internally creates providers and stats so callers don't need to.
+ *
+ * @param videoFilePath - Path to the input video file
+ * @param options - Optional configuration overrides
+ * @returns Result of the operation
+ */
+async function generateAudioDescriptionFromOptions(videoFilePath, options = {}, processingOptions = {}) {
+    const config = { ...(0, config_1.getDefaultConfig)(), ...options };
+    if (!fs_1.default.existsSync(config.tempDir)) {
+        fs_1.default.mkdirSync(config.tempDir, { recursive: true });
+    }
+    if (!fs_1.default.existsSync(config.outputDir)) {
+        fs_1.default.mkdirSync(config.outputDir, { recursive: true });
+    }
+    const visionProvider = visionProviderFactory_1.VisionProviderFactory.getProvider(config);
+    const ttsProvider = ttsProviderFactory_1.TTSProviderFactory.getProvider(config);
+    const stats = (0, stats_1.createStats)();
+    return generateAudioDescription(videoFilePath, visionProvider, ttsProvider, config, stats, processingOptions);
+}
+/** Low-level API: generate an audio description track frame by frame, using pre-initialized providers.
+ * @param videoFilePath - Path to the input video file
+ * @param visionProvider - Vision provider instance; describeImage and compareImages are called on it
+ * @param ttsProvider - TTS provider instance; textToSpeech is called on it
+ * @param options - Configuration used as given (tempDir, outputDir, captureIntervalSeconds, prompts, TTS fields); a truthy batchTimeMode delegates to generateAudioDescriptionBatch
+ * @param stats - Stats object for tracking; totalFrames is set and the cost counters are incremented in place
+ * @param processingOptions - Optional resume/progress settings: existingSegments, currentTimePosition, startIndex, onProgress
+ * @returns Promise resolving to { videoFile, audioDescriptionFile, segments }
+ */
+async function generateAudioDescription(videoFilePath, visionProvider, ttsProvider, options = {}, stats, processingOptions = {}) {
+    // Shallow-copy the caller's options; no defaults are merged in here, so every field read below must already be set
+    const settings = { ...options };
+    // Ensure temporary and output directories exist (created recursively when missing)
+    if (!fs_1.default.existsSync(settings.tempDir)) {
+        fs_1.default.mkdirSync(settings.tempDir, { recursive: true });
+    }
+    if (!fs_1.default.existsSync(settings.outputDir)) {
+        fs_1.default.mkdirSync(settings.outputDir, { recursive: true });
+    }
+    // Get video duration and record the expected frame count on the stats object (fractional remainder is floored away)
+    const videoDuration = (0, mediaUtils_1.getVideoDuration)(videoFilePath);
+    stats.totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
+    console.log(`Video duration: ${videoDuration} seconds`);
+    // Batch mode takes over entirely: nothing below this branch runs when settings.batchTimeMode is truthy
+    if (settings.batchTimeMode) {
+        return await generateAudioDescriptionBatch(videoFilePath, videoDuration, settings, visionProvider, ttsProvider, stats, processingOptions);
+    }
+    // Number of frames to capture (same expression as stats.totalFrames above)
+    const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
+    console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);
+    // Sliding window of recent frames; trimmed to settings.contextWindowSize inside the loop
+    const frameContext = [];
+    // Array to store audio segment information - seeded with a copy of existingSegments when resuming
+    const audioSegments = processingOptions.existingSegments
+        ? [...processingOptions.existingSegments]
+        : [];
+    // Position on the output timeline where the next description starts (pushed later when earlier audio runs long)
+    let currentTimePosition = processingOptions.currentTimePosition || 0;
+    // Start from given index if resuming (frameContext is NOT restored, see the note at the describeImage branch)
+    const startIndex = processingOptions.startIndex || 0;
+    // Drift = scheduled position minus ideal position; it only feeds the warning inside the loop
+    let timelineDrift = 0;
+    const maxAllowableDrift = settings.captureIntervalSeconds * 2; // Maximum drift before warning
+    // Process each frame sequentially; each iteration awaits one vision call and one TTS call
+    for (let i = startIndex; i < totalFrames; i++) {
+        // Calculate the ideal time position based on the original schedule
+        const idealTimePosition = i * settings.captureIntervalSeconds;
+        // Use the adjusted time position that accounts for previous audio durations
+        const timePosition = currentTimePosition;
+        // Calculate drift from the original schedule
+        timelineDrift = timePosition - idealTimePosition;
+        // Warn (without aborting) once the drift exceeds two capture intervals in either direction
+        if (Math.abs(timelineDrift) > maxAllowableDrift) {
+            console.warn(`WARNING: Timeline drift at frame ${i} is ${timelineDrift.toFixed(2)} seconds.`);
+        }
+        const frameFilePath = path_1.default.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`);
+        // The frame is grabbed at the ideal (unshifted) time, while its description is scheduled at timePosition
+        (0, mediaUtils_1.captureVideoFrame)(videoFilePath, idealTimePosition, frameFilePath);
+        console.log(`Captured frame at ${idealTimePosition} seconds (scheduled at ${timePosition.toFixed(2)} seconds)`);
+        // Add current frame to context
+        const currentFrame = {
+            index: i,
+            path: frameFilePath,
+            timePosition
+        };
+        frameContext.push(currentFrame);
+        // Keep context window at specified size by dropping the oldest entry
+        if (frameContext.length > settings.contextWindowSize) {
+            frameContext.shift();
+        }
+        // Generate description: standalone for a lone frame, otherwise a comparison against the previous frame
+        let description;
+        let usageStats;
+        if (frameContext.length === 1) {
+            // Single frame in context. NOTE(review): also true for the first frame after a resume and whenever contextWindowSize is 1 - confirm intended
+            const result = await visionProvider.describeImage(frameFilePath, settings.defaultPrompt);
+            description = result.description;
+            usageStats = result.usage;
+        }
+        else {
+            // Compare with previous frame (second-to-last context entry; the last entry is the current frame)
+            const previousFrame = frameContext[frameContext.length - 2];
+            const result = await visionProvider.compareImages(previousFrame.path, frameFilePath, settings.changePrompt);
+            description = result.description;
+            usageStats = result.usage;
+        }
+        // Update stats. NOTE(review): token counts are added to fields named *Cost - confirm the intended unit
+        stats.totalVisionInputCost += usageStats.inputTokens;
+        stats.totalVisionOutputCost += usageStats.outputTokens;
+        stats.totalCost += usageStats.totalTokens;
+        console.log(`Description: ${description}`);
+        // Generate speech from description into a per-frame mp3 inside tempDir
+        const audioFilePath = path_1.default.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`);
+        const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
+            voice: settings.ttsVoice,
+            model: settings.ttsProviders[settings.ttsProvider].model,
+            speedFactor: settings.ttsSpeedFactor,
+            instructions: settings.ttsInstructions
+        });
+        const audioDuration = ttsResult.duration;
+        stats.totalTTSCost += ttsResult.cost;
+        console.log(`Audio duration: ${audioDuration} seconds`);
+        // Store segment information; startTime is the adjusted position, not the ideal one
+        const segment = {
+            audioFile: audioFilePath,
+            startTime: timePosition,
+            duration: audioDuration,
+            description
+        };
+        audioSegments.push(segment);
+        // Notify progress callback, if one was supplied
+        if (processingOptions.onProgress) {
+            processingOptions.onProgress({
+                type: 'frame',
+                index: i,
+                total: totalFrames,
+                segment
+            });
+        }
+        // Update the time position for the next iteration
+        // Add a small buffer (0.25 sec) between descriptions to prevent hard cuts
+        const bufferTime = 0.25;
+        currentTimePosition = timePosition + audioDuration + bufferTime;
+        // If the audio ended before the next scheduled slot, snap forward to that slot so no description starts early
+        const nextIdealPosition = (i + 1) * settings.captureIntervalSeconds;
+        if (currentTimePosition < nextIdealPosition) {
+            console.log(`Audio finished before next scheduled frame. Catching up with timeline.`);
+            currentTimePosition = nextIdealPosition;
+            timelineDrift = 0; // Reset drift since we've caught up (it is recomputed at the top of the next iteration)
+        }
+    }
+    // Combine audio segments into final audio description track, named <video basename>_description.mp3 in outputDir
+    const outputAudioPath = path_1.default.join(settings.outputDir, `${path_1.default.basename(videoFilePath, path_1.default.extname(videoFilePath))}_description.mp3`);
+    (0, mediaUtils_1.combineAudioSegments)(audioSegments, outputAudioPath, videoDuration, settings);
+    // Temporary frame/audio files are left in settings.tempDir; the cleanup call below is disabled
+    // cleanupTempFiles(settings.tempDir);
+    console.log(`\nAudio description generated: ${outputAudioPath}`);
+    console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
+    (0, stats_1.printStats)(stats, settings);
+    return {
+        videoFile: videoFilePath,
+        audioDescriptionFile: outputAudioPath,
+        segments: audioSegments
+    };
+}
+/** Generate audio description using the "batch time" mode: one description per fixed-length window, with overlap prevention.
+ * @param videoFilePath - Path to the input video file
+ * @param videoDuration - Duration of the video in seconds
+ * @param settings - Configuration object; batchWindowDuration, framesInBatch, batchPrompt, tempDir, outputDir and the TTS fields are read
+ * @param visionProvider - The vision provider instance; describeBatch is called on it
+ * @param ttsProvider - The TTS provider instance; textToSpeech is called on it
+ * @param stats - Stats object for tracking; the cost counters are incremented in place
+ * @param processingOptions - Optional resume/progress settings: lastContext, existingSegments, currentTimePosition, startIndex, onProgress
+ * @returns Promise resolving to { videoFile, audioDescriptionFile, segments } */
+async function generateAudioDescriptionBatch(videoFilePath, videoDuration, settings, visionProvider, ttsProvider, stats, processingOptions = {}) {
+    const totalBatches = Math.floor(videoDuration / settings.batchWindowDuration);
+    console.log(`Using batchTimeMode. Total batches: ${totalBatches} (each covers ${settings.batchWindowDuration} sec)`);
+    // Context handed to describeBatch: starts from processingOptions.lastContext (or {}), then is replaced after every batch
+    let lastBatchContext = processingOptions.lastContext || {};
+    // Preload with a copy of existing segments if resuming
+    const audioSegments = processingOptions.existingSegments
+        ? [...processingOptions.existingSegments]
+        : [];
+    // Position on the output timeline where the next description starts (pushed later when earlier audio runs long)
+    let currentTimePosition = processingOptions.currentTimePosition || 0;
+    // Start from given index if resuming
+    const startBatchIndex = processingOptions.startIndex || 0;
+    // Drift = scheduled start minus ideal start; it only feeds the warning inside the loop
+    let timelineDrift = 0;
+    const maxAllowableDrift = settings.batchWindowDuration * 0.5; // Maximum drift of 50% of batch window
+    for (let batchIndex = startBatchIndex; batchIndex < totalBatches; batchIndex++) {
+        // Ideal start of this window on the original schedule (totalBatches is floored, so a trailing partial window is never processed)
+        const idealBatchStart = batchIndex * settings.batchWindowDuration;
+        // Use adjusted time position that accounts for previous audio durations
+        const batchStart = currentTimePosition;
+        // Calculate drift from the original schedule
+        timelineDrift = batchStart - idealBatchStart;
+        // Warn (without aborting) once the drift exceeds half a batch window in either direction
+        if (Math.abs(timelineDrift) > maxAllowableDrift) {
+            console.warn(`WARNING: Timeline drift at batch ${batchIndex} is ${timelineDrift.toFixed(2)} seconds.`);
+        }
+        const batchEnd = idealBatchStart + settings.batchWindowDuration;
+        if (batchEnd > videoDuration)
+            break; // Safety check
+        console.log(`Processing batch #${batchIndex}: Original time window ${idealBatchStart}-${batchEnd} sec, scheduled at ${batchStart.toFixed(2)} sec`);
+        // Capture framesInBatch frames evenly spaced across the ideal window, starting at idealBatchStart
+        const framePaths = [];
+        for (let i = 0; i < settings.framesInBatch; i++) {
+            const t = idealBatchStart + (i * settings.batchWindowDuration) / settings.framesInBatch;
+            const frameFilePath = path_1.default.join(settings.tempDir, `batch_${batchIndex}_frame_${i}.jpg`);
+            (0, mediaUtils_1.captureVideoFrame)(videoFilePath, t, frameFilePath);
+            framePaths.push(frameFilePath);
+        }
+        // Ask the vision provider for one description of the whole batch, passing the previous batch's context
+        const result = await visionProvider.describeBatch(framePaths, lastBatchContext, settings.batchPrompt);
+        const description = result.description;
+        const usageStats = result.usage;
+        // Update stats. NOTE(review): token counts are added to fields named *Cost - confirm the intended unit
+        stats.totalVisionInputCost += usageStats.inputTokens;
+        stats.totalVisionOutputCost += usageStats.outputTokens;
+        stats.totalCost += usageStats.totalTokens;
+        console.log(`Batch #${batchIndex} description:\n${description}\n`);
+        // Convert description to TTS, written to a per-batch mp3 inside tempDir
+        const audioFilePath = path_1.default.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`);
+        const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
+            voice: settings.ttsVoice,
+            model: settings.ttsProviders[settings.ttsProvider].model,
+            speedFactor: settings.ttsSpeedFactor,
+            instructions: settings.ttsInstructions
+        });
+        const audioDuration = ttsResult.duration;
+        stats.totalTTSCost += ttsResult.cost;
+        console.log(`Batch #${batchIndex} audio duration: ${audioDuration} seconds`);
+        // Store segment info with the adjusted start time (not the ideal one)
+        const segment = {
+            audioFile: audioFilePath,
+            startTime: batchStart,
+            duration: audioDuration,
+            description
+        };
+        audioSegments.push(segment);
+        // Notify progress callback, if one was supplied
+        if (processingOptions.onProgress) {
+            processingOptions.onProgress({
+                type: 'batch',
+                index: batchIndex,
+                total: totalBatches,
+                segment
+            });
+        }
+        // Update the time position for the next iteration
+        // Add a small buffer (0.5 sec) between descriptions
+        const bufferTime = 0.5;
+        currentTimePosition = batchStart + audioDuration + bufferTime;
+        // If the audio ended before the next scheduled window, snap forward to it so no description starts early
+        const nextIdealPosition = (batchIndex + 1) * settings.batchWindowDuration;
+        if (currentTimePosition < nextIdealPosition) {
+            console.log(`Batch audio finished before next scheduled batch. Catching up with timeline.`);
+            currentTimePosition = nextIdealPosition;
+            timelineDrift = 0; // Reset drift since we've caught up (it is recomputed at the top of the next iteration)
+        }
+        // Update lastBatchContext so the next batch can keep track of what's previously seen
+        lastBatchContext = {
+            lastDescription: description,
+            lastFramePaths: framePaths.slice(-2) // keep the last 2 frames from this batch
+        };
+    }
+    // Combine all the audio segments into one track, named <video basename>_description_batch.mp3 in outputDir
+    const outputAudioPath = path_1.default.join(settings.outputDir, `${path_1.default.basename(videoFilePath, path_1.default.extname(videoFilePath))}_description_batch.mp3`);
+    (0, mediaUtils_1.combineAudioSegments)(audioSegments, outputAudioPath, videoDuration, settings);
+    console.log(`\nBatch audio description generated: ${outputAudioPath}`);
+    console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
+    (0, stats_1.printStats)(stats, settings);
+    return {
+        videoFile: videoFilePath,
+        audioDescriptionFile: outputAudioPath,
+        segments: audioSegments
+    };
+}
+//# sourceMappingURL=processor.js.map