Compare commits
10 Commits
other-prov
...
3a198d7d50
| Author | SHA1 | Date | |
|---|---|---|---|
| 3a198d7d50 | |||
| f05e57493c | |||
| 6e9a26557f | |||
| eb15af3a36 | |||
| 19975917c5 | |||
| fc02d2001c | |||
| 507d4f6474 | |||
| 9425b4b256 | |||
| 6ffb3f45ba | |||
| 5312410d7e |
15
.eslintrc.js
Normal file
15
.eslintrc.js
Normal file
@@ -0,0 +1,15 @@
|
||||
module.exports = {
|
||||
parser: '@typescript-eslint/parser',
|
||||
extends: [
|
||||
'plugin:@typescript-eslint/recommended',
|
||||
],
|
||||
parserOptions: {
|
||||
ecmaVersion: 2020,
|
||||
sourceType: 'module',
|
||||
},
|
||||
rules: {
|
||||
'@typescript-eslint/explicit-function-return-type': 'off',
|
||||
'@typescript-eslint/no-explicit-any': 'off',
|
||||
'@typescript-eslint/no-unused-vars': ['warn', { argsIgnorePattern: '^_' }],
|
||||
},
|
||||
};
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,2 +1,3 @@
|
||||
.env
|
||||
node_modules
|
||||
desc/
|
||||
15
env.example
15
env.example
@@ -1,8 +1,9 @@
|
||||
# OpenAI API Key
|
||||
OPENAI_API_KEY=meow
|
||||
# OpenAI API KEY (for vision and TTS)
|
||||
OPENAI_API_KEY=your_openai_api_key_here
|
||||
|
||||
# Optional configuration overrides (these can also be set in settings.json or passed as command-line parameters)
|
||||
CAPTURE_INTERVAL_SECONDS=10
|
||||
CONTEXT_WINDOW_SIZE=5
|
||||
TTS_MODEL=tts-1-hd
|
||||
TTS_VOICE=alloy
|
||||
# Google API KEY (for Gemini vision)
|
||||
GOOGLE_API_KEY=your_google_api_key_here
|
||||
|
||||
# Custom paths (optional)
|
||||
# OUTPUT_DIR=./desc/output/
|
||||
# TEMP_DIR=./desc/tmp/
|
||||
|
||||
156
index.js
156
index.js
@@ -20,8 +20,8 @@ const defaultConfig = {
|
||||
batchPrompt: "Describe the sequence of frames in this batch over time for someone who cannot see it. Focus on what happens, changes, or stands out visually during these seconds. Keep it to 1-3 concise sentences, avoiding words like 'in these frames'—just describe what's happening. Use context from the previous batch if relevant. Keep sentences short and concise. Avoid speculation or overly verbose or unnecessary sentences. Try not to use nested sentences and keep sentences short to help flow. This will be used for audio description and mixed back in with the video file later, so we need to maintain consistency and quick pacing. Avoid using phrases such as 'as evidenced by' or 'suggesting'. Only focus on describing the visual scene. Do not repeat information given in the previous prompt, and focus only on what has changed since that description. Avoid talking about the scene or sequence, simply focus on the action within these frames. The listener knows that this is a video, so we do not need to remind them. Also avoid overusing phrases such as 'the scene shifts', the shifting or perspective change should be evident from the description of the sequence itself.",
|
||||
|
||||
// Vision AI settings
|
||||
visionProvider: "openai",
|
||||
visionModel: "gpt-4o",
|
||||
visionProvider: "gemini",
|
||||
visionModel: "gemini-3.0-flash",
|
||||
visionProviders: {
|
||||
openai: {
|
||||
apiKey: process.env.OPENAI_API_KEY,
|
||||
@@ -32,6 +32,12 @@ const defaultConfig = {
|
||||
apiKey: process.env.GOOGLE_API_KEY,
|
||||
model: "gemini-2.0-flash",
|
||||
maxTokens: 300
|
||||
},
|
||||
ollama: {
|
||||
// Example config; adjust to match your local Ollama setup
|
||||
baseUrl: "http://localhost:11434", // or wherever Ollama is hosted
|
||||
model: "gemma3:12b",
|
||||
maxTokens: 3000
|
||||
}
|
||||
// Add other vision providers here
|
||||
},
|
||||
@@ -50,8 +56,8 @@ const defaultConfig = {
|
||||
},
|
||||
|
||||
// Video processing settings
|
||||
outputDir: "/mnt/e/desc/output/",
|
||||
tempDir: "/mnt/e/desc/temp/",
|
||||
outputDir: "./desc/output/",
|
||||
tempDir: "./desc/tmp/",
|
||||
batchTimeMode: true, // Whether to use the new batch time mode
|
||||
batchWindowDuration: 15, // How many seconds each batch covers
|
||||
framesInBatch: 10, // How many frames to capture within each batch
|
||||
@@ -83,6 +89,8 @@ class VisionProviderFactory {
|
||||
return new OpenAIVisionProvider(providerConfig);
|
||||
case 'gemini':
|
||||
return new GeminiVisionProvider(providerConfig);
|
||||
case "ollama":
|
||||
return new OllamaVisionProvider(providerConfig);
|
||||
// Add other providers here
|
||||
default:
|
||||
throw new Error(`Vision provider "${providerName}" not implemented.`);
|
||||
@@ -136,6 +144,7 @@ class OpenAIVisionProvider {
|
||||
|
||||
const response = await this.openai.chat.completions.create({
|
||||
model: this.config.model,
|
||||
temperature: 0.1,
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
@@ -467,6 +476,145 @@ class GeminiVisionProvider {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Ollama Vision Provider Implementation
|
||||
* See: https://github.com/ollama/ollama/blob/main/docs/api.md
|
||||
*/
|
||||
class OllamaVisionProvider {
  /**
   * Create a provider that talks to an Ollama server over HTTP.
   * @param {object} config - Provider settings
   * @param {string} [config.baseUrl] - Base URL of the Ollama server (defaults to http://localhost:11434)
   * @param {string} config.model - Model name sent with every request
   * @param {number} [config.maxTokens] - Cap on generated tokens (defaults to 300)
   */
  constructor(config) {
    this.config = config;
    this.axiosInstance = axios.create({
      // 11434 is Ollama's default port and matches the default provider config.
      baseURL: config.baseUrl || "http://localhost:11434",
      headers: { "Content-Type": "application/json" }
    });
  }

  /**
   * Build the `options` payload for /api/generate (internal helper).
   * Ollama calls its output-length cap `num_predict`; `max_tokens` is not an
   * Ollama option and would not limit the response.
   * @param {number} temperature - Sampling temperature for this request
   * @returns {{num_predict: number, temperature: number}} Generation options
   */
  buildGenerationOptions(temperature) {
    return {
      num_predict: this.config.maxTokens || 300,
      temperature
    };
  }

  /**
   * Convert an /api/generate response body into the provider result shape (internal helper).
   * Token usage comes from Ollama's `prompt_eval_count` / `eval_count` fields and
   * falls back to 0 when the server omits them.
   * @param {object} data - Parsed response body
   * @returns {{description: string, usage: object}} Description and usage stats
   */
  toVisionResult(data) {
    const inputTokens = data.prompt_eval_count || 0;
    const outputTokens = data.eval_count || 0;
    return {
      description: (data.response || "").trim(),
      usage: {
        inputTokens,
        outputTokens,
        totalTokens: inputTokens + outputTokens
      }
    };
  }

  /**
   * Describe a single image
   * @param {string} imagePath - Path to the image file
   * @param {string} prompt - Prompt for the AI
   * @returns {Promise<{description: string, usage: object}>} Description and usage stats;
   *   on any error a fallback description with zeroed usage is returned instead of throwing
   */
  async describeImage(imagePath, prompt) {
    try {
      const base64Image = fs.readFileSync(imagePath).toString('base64');

      const response = await this.axiosInstance.post('/api/generate', {
        model: this.config.model,
        prompt: prompt,
        images: [base64Image],
        stream: false,
        options: this.buildGenerationOptions(0.1)
      });

      return this.toVisionResult(response.data);
    } catch (error) {
      console.error("Ollama describeImage error:", error);
      return {
        description: "Unable to describe this image.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Compare two images and describe differences
   * @param {string} image1Path - Path to the first image
   * @param {string} image2Path - Path to the second image
   * @param {string} prompt - Prompt for the AI
   * @returns {Promise<{description: string, usage: object}>} Description and usage stats;
   *   on any error a fallback description with zeroed usage is returned instead of throwing
   */
  async compareImages(image1Path, image2Path, prompt) {
    try {
      const image1Data = fs.readFileSync(image1Path).toString('base64');
      const image2Data = fs.readFileSync(image2Path).toString('base64');

      const response = await this.axiosInstance.post('/api/generate', {
        model: this.config.model,
        prompt: prompt,
        images: [image1Data, image2Data],
        stream: false,
        options: this.buildGenerationOptions(0.2)
      });

      return this.toVisionResult(response.data);
    } catch (error) {
      console.error("Ollama compareImages error:", error);
      return {
        description: "Unable to describe the differences.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Describe a batch of images
   * @param {string[]} imagePaths - Array of paths to the images
   * @param {object} lastBatchContext - Context from the previous batch (optional)
   * @param {string} prompt - Prompt for the AI
   * @returns {Promise<{description: string, usage: object}>} Description and usage stats;
   *   on any error a fallback description with zeroed usage is returned instead of throwing
   */
  async describeBatch(imagePaths, lastBatchContext, prompt) {
    try {
      // If there's context, prepend it. This helps maintain a storyline across batches.
      const userPrompt = lastBatchContext && lastBatchContext.lastDescription
        ? `Previous batch summary: ${lastBatchContext.lastDescription}\n\n${prompt}`
        : prompt;

      // Convert images to base64
      const imagesBase64 = imagePaths.map(fp => fs.readFileSync(fp).toString('base64'));

      // Generation options are not sent for batches, so the model's defaults apply.
      const response = await this.axiosInstance.post('/api/generate', {
        model: this.config.model,
        prompt: userPrompt,
        images: imagesBase64,
        stream: false
      }, {
        timeout: 120000 // milliseconds (2 minutes); multi-image requests can be slow
      });

      return this.toVisionResult(response.data);
    } catch (error) {
      console.error("Ollama describeBatch error:", error);
      return {
        description: "Unable to describe this batch of images.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }
}
|
||||
|
||||
/**
|
||||
* OpenAI TTS Provider Implementation
|
||||
*/
|
||||
|
||||
7425
package-lock.json
generated
7425
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
71
package.json
71
package.json
@@ -1,20 +1,55 @@
|
||||
{
|
||||
"name": "video-audio-description-generator",
|
||||
"version": "1.0.0",
|
||||
"description": "Generate AI-powered audio descriptions for video content",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"start": "node index.js"
|
||||
},
|
||||
"dependencies": {
|
||||
"@google/generative-ai": "^0.24.0",
|
||||
"axios": "^1.6.2",
|
||||
"dotenv": "^16.3.1",
|
||||
"fluent-ffmpeg": "^2.1.2",
|
||||
"openai": "^4.20.1",
|
||||
"yargs": "^17.7.2"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
"name": "video-audio-description-generator",
|
||||
"version": "1.0.0",
|
||||
"description": "Generate AI-powered audio descriptions for video content",
|
||||
"main": "dist/index.js",
|
||||
"types": "dist/index.d.ts",
|
||||
"bin": {
|
||||
"aidio-desc": "./dist/cli/index.js"
|
||||
},
|
||||
"files": [
|
||||
"dist"
|
||||
],
|
||||
"scripts": {
|
||||
"build": "tsc",
|
||||
"start": "node dist/cli/index.js",
|
||||
"dev": "ts-node src/cli/index.ts",
|
||||
"test": "jest",
|
||||
"lint": "eslint src/**/*.ts",
|
||||
"prepublishOnly": "npm run build"
|
||||
},
|
||||
"dependencies": {
|
||||
"@google-cloud/text-to-speech": "^6.4.1",
|
||||
"@google/generative-ai": "^0.24.0",
|
||||
"axios": "^1.6.2",
|
||||
"dotenv": "^16.3.1",
|
||||
"fluent-ffmpeg": "^2.1.2",
|
||||
"openai": "^4.20.1",
|
||||
"yargs": "^17.7.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/fluent-ffmpeg": "^2.1.24",
|
||||
"@types/jest": "^29.5.5",
|
||||
"@types/node": "^20.8.2",
|
||||
"@types/yargs": "^17.0.32",
|
||||
"@typescript-eslint/eslint-plugin": "^6.7.4",
|
||||
"@typescript-eslint/parser": "^6.7.4",
|
||||
"eslint": "^8.50.0",
|
||||
"jest": "^29.7.0",
|
||||
"ts-jest": "^29.1.1",
|
||||
"ts-node": "^10.9.1",
|
||||
"typescript": "^5.2.2"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
},
|
||||
"keywords": [
|
||||
"video",
|
||||
"audio",
|
||||
"description",
|
||||
"ai",
|
||||
"accessibility"
|
||||
],
|
||||
"author": "",
|
||||
"license": "MIT"
|
||||
}
|
||||
|
||||
BIN
src/.DS_Store
vendored
Normal file
BIN
src/.DS_Store
vendored
Normal file
Binary file not shown.
141
src/cli/args.ts
Normal file
141
src/cli/args.ts
Normal file
@@ -0,0 +1,141 @@
|
||||
import yargs from 'yargs/yargs';
|
||||
import { hideBin } from 'yargs/helpers';
|
||||
|
||||
export interface CLIArgs {
|
||||
_: (string | number)[];
|
||||
$0: string;
|
||||
video_file_path?: string;
|
||||
captureIntervalSeconds?: number;
|
||||
contextWindowSize?: number;
|
||||
visionProvider?: string;
|
||||
visionModel?: string;
|
||||
ttsProvider?: string;
|
||||
ttsModel?: string;
|
||||
ttsVoice?: string;
|
||||
ttsSpeedFactor?: number;
|
||||
ttsInstructions?: string;
|
||||
outputDir?: string;
|
||||
tempDir?: string;
|
||||
batchTimeMode?: boolean;
|
||||
batchWindowDuration?: number;
|
||||
framesInBatch?: number;
|
||||
defaultPrompt?: string;
|
||||
changePrompt?: string;
|
||||
batchPrompt?: string;
|
||||
estimate?: boolean;
|
||||
config?: string;
|
||||
saveConfig?: string;
|
||||
}
|
||||
|
||||
/**
 * Read process.argv (minus the node/script prefix) and turn it into a CLIArgs object.
 * All options are declared in one table; help text, the `-h` alias and usage
 * examples are registered afterwards.
 */
export function parseCommandLineArgs(): CLIArgs {
  const cli = yargs(hideBin(process.argv))
    .usage('Usage: $0 <video_file_path> [options]')
    .positional('video_file_path', {
      describe: 'Path to the input video file',
      type: 'string'
    })
    .options({
      captureIntervalSeconds: {
        alias: 'i',
        describe: 'Interval in seconds between frame captures',
        type: 'number'
      },
      contextWindowSize: {
        alias: 'c',
        describe: 'Number of frames to keep in context',
        type: 'number'
      },
      visionProvider: {
        describe: 'Provider to use for vision AI',
        type: 'string'
      },
      visionModel: {
        describe: 'Model to use for vision AI',
        type: 'string'
      },
      ttsProvider: {
        describe: 'Provider to use for text-to-speech',
        type: 'string'
      },
      ttsModel: {
        alias: 'm',
        describe: 'TTS model to use',
        type: 'string'
      },
      ttsVoice: {
        alias: 'v',
        describe: 'Voice to use for text-to-speech',
        type: 'string'
      },
      ttsSpeedFactor: {
        alias: 's',
        describe: 'Speed factor for the audio playback',
        type: 'number'
      },
      ttsInstructions: {
        describe: 'Instructions for TTS voice style (gpt-4o-mini-tts)',
        type: 'string'
      },
      outputDir: {
        alias: 'o',
        describe: 'Directory for output files',
        type: 'string'
      },
      tempDir: {
        alias: 't',
        describe: 'Directory for temporary files',
        type: 'string'
      },
      batchTimeMode: {
        alias: 'b',
        describe: 'Use batch time mode for processing',
        type: 'boolean'
      },
      batchWindowDuration: {
        describe: 'Duration in seconds for each batch window',
        type: 'number'
      },
      framesInBatch: {
        describe: 'Number of frames to capture within each batch',
        type: 'number'
      },
      defaultPrompt: {
        describe: 'Prompt for describing individual frames',
        type: 'string'
      },
      changePrompt: {
        describe: 'Prompt for describing changes between frames',
        type: 'string'
      },
      batchPrompt: {
        describe: 'Prompt for describing batches of frames',
        type: 'string'
      },
      estimate: {
        alias: 'e',
        describe: 'Only estimate the cost without generating the audio description',
        type: 'boolean',
        default: false
      },
      config: {
        alias: 'f',
        describe: 'Path to JSON config file',
        type: 'string'
      },
      saveConfig: {
        describe: 'Save current configuration to specified JSON file',
        type: 'string'
      }
    })
    .help()
    .alias('help', 'h')
    .example('$0 video.mp4', 'Process a video with default settings')
    .example('$0 video.mp4 --ttsVoice nova --visionProvider openai', 'Process with custom voice and vision provider')
    .example('$0 video.mp4 --estimate', 'Only estimate the processing cost')
    .example('$0 video.mp4 --config myconfig.json', 'Use settings from a config file')
    .example('$0 video.mp4 --saveConfig myconfig.json', 'Save current settings to a config file');

  // The yargs result type does not line up with CLIArgs directly, hence the double cast.
  return cli.argv as unknown as CLIArgs;
}
|
||||
90
src/cli/index.ts
Normal file
90
src/cli/index.ts
Normal file
@@ -0,0 +1,90 @@
|
||||
import 'dotenv/config';
|
||||
import { getDefaultConfig } from '../config/config';
|
||||
import { createStats } from '../config/stats';
|
||||
import { VisionProviderFactory } from '../providers/vision/visionProviderFactory';
|
||||
import { TTSProviderFactory } from '../providers/tts/ttsProviderFactory';
|
||||
import { generateAudioDescription } from '../utils/processor';
|
||||
import { estimateCost } from '../utils/costEstimator';
|
||||
import { loadConfigFromFile, saveConfigToFile } from '../utils/configUtils';
|
||||
import { parseCommandLineArgs } from './args';
|
||||
import { Config } from '../config/config';
|
||||
|
||||
/**
 * CLI entry point: resolve the effective configuration (defaults, then the
 * optional config file, then command-line overrides) and either print a cost
 * estimate or generate the audio description for the given video.
 *
 * Exits with status 1 when no video file is given, and sets a non-zero exit
 * code when estimation or generation fails so callers can detect the failure.
 */
async function main(): Promise<void> {
  const argv = parseCommandLineArgs();

  let config: Config = getDefaultConfig();
  // Only keys that exist in the default config count as configuration fields.
  const configKeys = new Set<string>(Object.keys(config));

  if (argv.config) {
    const fileConfig = loadConfigFromFile(argv.config);
    // Shallow merge: top-level keys from the file replace the defaults.
    config = { ...config, ...fileConfig };
  }

  // yargs also exposes each option under its short alias (and dashed spelling),
  // and `estimate` always carries a default, so copying every argv key would
  // leak entries such as `e: false` into the config (and into --saveConfig output).
  // `ttsModel` is not a config field; it is applied to the provider entry below.
  const argvObj = argv as unknown as Record<string, unknown>;
  Object.keys(argvObj).forEach(key => {
    if (configKeys.has(key) && argvObj[key] !== undefined) {
      (config as any)[key] = argvObj[key];
    }
  });

  // Model/voice overrides live on the selected provider's entry; create the
  // entry first if the chosen provider has none.
  if (argv.visionModel) {
    if (!config.visionProviders[config.visionProvider]) {
      config.visionProviders[config.visionProvider] = { model: '' };
    }
    config.visionProviders[config.visionProvider].model = argv.visionModel;
  }

  if (argv.ttsModel) {
    if (!config.ttsProviders[config.ttsProvider]) {
      config.ttsProviders[config.ttsProvider] = { model: '' };
    }
    config.ttsProviders[config.ttsProvider].model = argv.ttsModel;
  }

  if (argv.ttsVoice) {
    if (!config.ttsProviders[config.ttsProvider]) {
      config.ttsProviders[config.ttsProvider] = { model: '', voice: '' };
    }
    config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
  }

  if (argv.saveConfig) {
    saveConfigToFile(argv.saveConfig, config);
  }

  if (argv._.length < 1) {
    console.error('Error: No video file specified');
    console.log('Usage: node script.js <video_file_path> [options]');
    console.log('Use --help for more information');
    process.exit(1);
  }

  const videoFilePath = String(argv._[0]);

  if (argv.estimate) {
    try {
      const costBreakdown = await estimateCost(videoFilePath, config);
      console.log('\n=== COST ESTIMATION ===');
      console.log(JSON.stringify(costBreakdown, null, 2));
      console.log(`\nEstimated total cost: ${costBreakdown.apiCosts.total}`);
      console.log(`Estimated processing time: ${costBreakdown.estimates.estimatedProcessingTimeMinutes.toFixed(1)} minutes`);
      console.log('Note: Actual costs may vary based on image complexity and actual response lengths.');
    } catch (err) {
      console.error('Error estimating costs:', err);
      // Report the failure to the shell instead of exiting with status 0.
      process.exitCode = 1;
    }
  } else {
    try {
      const stats = createStats();
      const visionProvider = VisionProviderFactory.getProvider(config);
      const ttsProvider = TTSProviderFactory.getProvider(config);
      await generateAudioDescription(videoFilePath, visionProvider, ttsProvider, config, stats);
    } catch (err) {
      console.error('Error generating audio description:', err);
      // Report the failure to the shell instead of exiting with status 0.
      process.exitCode = 1;
    }
  }
}
|
||||
|
||||
if (require.main === module) {
|
||||
main().catch(err => console.error('Unhandled error:', err));
|
||||
}
|
||||
109
src/config/config.ts
Normal file
109
src/config/config.ts
Normal file
@@ -0,0 +1,109 @@
|
||||
import { VisionProviderConfig, TTSProviderConfig } from '../interfaces';
|
||||
|
||||
export interface Config {
|
||||
captureIntervalSeconds: number;
|
||||
contextWindowSize: number;
|
||||
defaultPrompt: string;
|
||||
changePrompt: string;
|
||||
batchPrompt: string;
|
||||
|
||||
// Vision AI settings
|
||||
visionProvider: string;
|
||||
visionModel: string;
|
||||
visionProviders: {
|
||||
[key: string]: VisionProviderConfig;
|
||||
};
|
||||
|
||||
// TTS settings
|
||||
ttsProvider: string;
|
||||
ttsVoice: string;
|
||||
ttsSpeedFactor: number;
|
||||
ttsInstructions?: string;
|
||||
ttsProviders: {
|
||||
[key: string]: TTSProviderConfig;
|
||||
};
|
||||
|
||||
// Video processing settings
|
||||
outputDir: string;
|
||||
tempDir: string;
|
||||
batchTimeMode: boolean;
|
||||
batchWindowDuration: number;
|
||||
framesInBatch: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get default configuration options.
|
||||
* Uses a function so that process.env is read at call time
|
||||
* (after dotenv has been loaded), not at module import time.
|
||||
*/
|
||||
export function getDefaultConfig(): Config {
|
||||
return {
|
||||
captureIntervalSeconds: 10,
|
||||
contextWindowSize: 5,
|
||||
defaultPrompt: "Describe this frame from a video in 1-2 sentences for someone who cannot see it. Focus on key visual elements. Avoid using terms like 'in this frame', simply describe the actual frame. Keep sentences short and concise, as this will be used to generate an audio track which is overlayed on the video.",
|
||||
changePrompt: "Describe what has changed between these frames in 1-2 sentences for someone who cannot see the video. Focus on significant visual changes only. Avoid talking about meta information such as 'in this frame', or 'the significant change is', and merely describe the actual change taking place. Only describe the changes relevant to the last frame. The previous frames are attached for you to build context and build situational awareness. Keep it short and concise, as your text will be used to generate audio description tracks to be played with the video.",
|
||||
batchPrompt: "Describe the sequence of frames in this batch over time for someone who cannot see it. Focus on what happens, changes, or stands out visually during these seconds. Keep it to 1-3 concise sentences, avoiding words like 'in these frames'—just describe what's happening. Use context from the previous batch if relevant. Keep sentences short and concise. Avoid speculation or overly verbose or unnecessary sentences. Try not to use nested sentences and keep sentences short to help flow. This will be used for audio description and mixed back in with the video file later, so we need to maintain consistency and quick pacing. Avoid using phrases such as 'as evidenced by' or 'suggesting'. Only focus on describing the visual scene. Do not repeat information given in the previous prompt, and focus only on what has changed since that description. Avoid talking about the scene or sequence, simply focus on the action within these frames. The listener knows that this is a video, so we do not need to remind them. Also avoid overusing phrases such as 'the scene shifts', the shifting or perspective change should be evident from the description of the sequence itself.",
|
||||
|
||||
// Vision AI settings
|
||||
visionProvider: "openai",
|
||||
visionModel: "gpt-5.4-mini",
|
||||
visionProviders: {
|
||||
openai: {
|
||||
apiKey: process.env.OPENAI_API_KEY,
|
||||
model: "gpt-5.4-mini",
|
||||
maxTokens: 300
|
||||
},
|
||||
gemini: {
|
||||
apiKey: process.env.GOOGLE_API_KEY,
|
||||
model: "gemini-2.0-flash",
|
||||
maxTokens: 300
|
||||
},
|
||||
ollama: {
|
||||
baseUrl: "http://localhost:11434",
|
||||
model: "gemma3:12b",
|
||||
maxTokens: 3000
|
||||
},
|
||||
openrouter: {
|
||||
apiKey: process.env.OPENROUTER_API_KEY,
|
||||
model: "anthropic/claude-sonnet-4.5",
|
||||
baseUrl: "https://openrouter.ai/api/v1",
|
||||
maxTokens: 300
|
||||
}
|
||||
},
|
||||
|
||||
// TTS settings
|
||||
ttsProvider: "openai",
|
||||
ttsVoice: "alloy",
|
||||
ttsSpeedFactor: 1.5,
|
||||
ttsInstructions: "Speak in a calm, narrating tone suitable for audio descriptions. Keep a steady pace and clear enunciation.",
|
||||
ttsProviders: {
|
||||
openai: {
|
||||
apiKey: process.env.OPENAI_API_KEY,
|
||||
model: "gpt-4o-mini-tts",
|
||||
voice: "alloy"
|
||||
},
|
||||
elevenlabs: {
|
||||
apiKey: process.env.ELEVENLABS_API_KEY,
|
||||
model: "eleven_multilingual_v2",
|
||||
voice: "JBFqnCBsd6RMkjVDRZzb"
|
||||
},
|
||||
google: {
|
||||
apiKey: process.env.GOOGLE_CLOUD_TTS_KEY,
|
||||
keyFilename: process.env.GOOGLE_CLOUD_TTS_KEYFILE,
|
||||
model: "chirp-hd",
|
||||
voice: "en-US-Chirp-HD-F"
|
||||
}
|
||||
},
|
||||
|
||||
// Video processing settings
|
||||
outputDir: "./desc/output/",
|
||||
tempDir: "./desc/tmp/",
|
||||
batchTimeMode: true,
|
||||
batchWindowDuration: 15,
|
||||
framesInBatch: 10,
|
||||
};
|
||||
}
|
||||
|
||||
// Keep a static export alias for backward compatibility
|
||||
// (but callers should prefer getDefaultConfig() for correct env loading)
|
||||
export const defaultConfig = getDefaultConfig();
|
||||
2
src/config/index.ts
Normal file
2
src/config/index.ts
Normal file
@@ -0,0 +1,2 @@
|
||||
// `Config` is an interface, so it is re-exported as a type; a value-style
// re-export of a type-only symbol fails under `isolatedModules`.
export type { Config } from './config';
export { getDefaultConfig, defaultConfig } from './config';
export { createStats, printStats } from './stats';
|
||||
79
src/config/stats.ts
Normal file
79
src/config/stats.ts
Normal file
@@ -0,0 +1,79 @@
|
||||
import { Stats } from '../interfaces';
|
||||
import { Config } from './config';
|
||||
|
||||
/**
 * Build a fresh statistics accumulator with every counter set to zero.
 * A new object is returned on each call.
 */
export const createStats = (): Stats => {
  const zeroed: Stats = {
    totalFrames: 0,
    totalBatches: 0,
    totalVisionInputCost: 0,
    totalVisionOutputCost: 0,
    totalTTSCost: 0,
    totalCost: 0
  };
  return zeroed;
};
|
||||
|
||||
// Pricing constants (as of March 2025)
|
||||
const pricing: {
|
||||
vision: Record<string, Record<string, { input: number; output: number }>>;
|
||||
tts: Record<string, Record<string, number>>;
|
||||
} = {
|
||||
vision: {
|
||||
openai: {
|
||||
'gpt-4o': {
|
||||
input: 0.0025,
|
||||
output: 0.01
|
||||
}
|
||||
},
|
||||
gemini: {
|
||||
'gemini-pro-vision': {
|
||||
input: 0.0025,
|
||||
output: 0.0025
|
||||
}
|
||||
}
|
||||
},
|
||||
tts: {
|
||||
openai: {
|
||||
'tts-1': 0.015,
|
||||
'tts-1-hd': 0.030
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Print cost statistics for a finished run.
 *
 * The `totalVisionInputCost`, `totalVisionOutputCost` and `totalTTSCost`
 * counters are treated as raw unit counts here: each is multiplied by the
 * matching price and divided by 1000. When no price is known for the selected
 * provider/model, a warning is printed and that cost is reported as 0.
 *
 * @param stats - Statistics object
 * @param settings - Configuration settings
 */
export function printStats(stats: Stats, settings: Config): void {
  // Resolve the selected providers and models. A provider without an entry in
  // the providers map must not crash the report, so fall back gracefully.
  const visionProvider = settings.visionProvider;
  const visionModel = settings.visionProviders[visionProvider]?.model ?? settings.visionModel;
  const ttsProvider = settings.ttsProvider;
  const ttsModel = settings.ttsProviders[ttsProvider]?.model ?? 'unknown';

  // Look up pricing; either lookup may come back undefined.
  const visionPricing = pricing.vision[visionProvider]?.[visionModel];
  const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel];

  if (visionPricing === undefined) {
    console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`);
  }

  // Compare against undefined so that a legitimate price of 0 is not reported as missing.
  if (ttsPricing === undefined) {
    console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`);
  }

  // Calculate prices using available pricing data
  const visionInputCost = visionPricing !== undefined ? (stats.totalVisionInputCost * visionPricing.input / 1000) : 0;
  const visionOutputCost = visionPricing !== undefined ? (stats.totalVisionOutputCost * visionPricing.output / 1000) : 0;
  const ttsCost = ttsPricing !== undefined ? (stats.totalTTSCost * ttsPricing / 1000) : 0;
  const totalCost = visionInputCost + visionOutputCost + ttsCost;

  // Print out the stats
  console.log('\n=== STATISTICS ===');
  console.log(`Vision provider: ${visionProvider}, Model: ${visionModel}`);
  console.log(`TTS provider: ${ttsProvider}, Model: ${ttsModel}`);
  console.log(`Total vision input cost: ${visionInputCost.toFixed(4)}`);
  console.log(`Total vision output cost: ${visionOutputCost.toFixed(4)}`);
  console.log(`Total TTS cost: ${ttsCost.toFixed(4)}`);
  console.log(`Total cost: ${totalCost.toFixed(4)}`);
}
|
||||
118
src/index.ts
Normal file
118
src/index.ts
Normal file
@@ -0,0 +1,118 @@
|
||||
import 'dotenv/config';
|
||||
import { getDefaultConfig } from './config/config';
|
||||
import { createStats } from './config/stats';
|
||||
import { VisionProviderFactory } from './providers/vision/visionProviderFactory';
|
||||
import { TTSProviderFactory } from './providers/tts/ttsProviderFactory';
|
||||
import { generateAudioDescription } from './utils/processor';
|
||||
import { estimateCost } from './utils/costEstimator';
|
||||
import { loadConfigFromFile, saveConfigToFile } from './utils/configUtils';
|
||||
import { parseCommandLineArgs } from './cli/args';
|
||||
|
||||
// Export functions and types for use as a module
|
||||
export { generateAudioDescriptionFromOptions, generateAudioDescription } from './utils/processor';
|
||||
export { estimateCost } from './utils/costEstimator';
|
||||
export { getDefaultConfig, defaultConfig } from './config/config';
|
||||
export { VisionProviderFactory } from './providers/vision/visionProviderFactory';
|
||||
export { TTSProviderFactory } from './providers/tts/ttsProviderFactory';
|
||||
export { createStats, printStats } from './config/stats';
|
||||
export { loadConfigFromFile, saveConfigToFile } from './utils/configUtils';
|
||||
export type { Config } from './config/config';
|
||||
export type {
|
||||
ProcessingResult,
|
||||
CostBreakdown,
|
||||
Stats,
|
||||
VisionProvider,
|
||||
TTSProvider,
|
||||
AudioSegment,
|
||||
BatchContext,
|
||||
VisionResult,
|
||||
TTSResult,
|
||||
VisionProviderConfig,
|
||||
TTSProviderConfig,
|
||||
TTSOptions
|
||||
} from './interfaces';
|
||||
|
||||
// CLI entry point when run directly
|
||||
if (require.main === module) {
|
||||
main().catch(err => console.error('Unhandled error:', err));
|
||||
}
|
||||
|
||||
/**
 * Entry point used when this module is run directly: resolve the effective
 * configuration (defaults, then the optional config file, then command-line
 * overrides) and either print a cost estimate or generate the audio
 * description for the given video.
 *
 * Exits with status 1 when no video file is given, and sets a non-zero exit
 * code when estimation or generation fails so callers can detect the failure.
 */
async function main(): Promise<void> {
  const argv = parseCommandLineArgs();

  let config = getDefaultConfig();
  // Only keys that exist in the default config count as configuration fields.
  const configKeys = new Set<string>(Object.keys(config));

  if (argv.config) {
    const fileConfig = loadConfigFromFile(argv.config);
    // Shallow merge: top-level keys from the file replace the defaults.
    config = { ...config, ...fileConfig };
  }

  // yargs also exposes each option under its short alias (and dashed spelling),
  // and `estimate` always carries a default, so copying every argv key would
  // leak entries such as `e: false` into the config (and into --saveConfig output).
  // `ttsModel` is not a config field; it is applied to the provider entry below.
  const argvObj = argv as unknown as Record<string, unknown>;
  Object.keys(argvObj).forEach(key => {
    if (configKeys.has(key) && argvObj[key] !== undefined) {
      (config as any)[key] = argvObj[key];
    }
  });

  // Model/voice overrides live on the selected provider's entry; create the
  // entry first if the chosen provider has none.
  if (argv.visionModel) {
    if (!config.visionProviders[config.visionProvider]) {
      config.visionProviders[config.visionProvider] = { model: '' };
    }
    config.visionProviders[config.visionProvider].model = argv.visionModel;
  }

  if (argv.ttsModel) {
    if (!config.ttsProviders[config.ttsProvider]) {
      config.ttsProviders[config.ttsProvider] = { model: '' };
    }
    config.ttsProviders[config.ttsProvider].model = argv.ttsModel;
  }

  if (argv.ttsVoice) {
    if (!config.ttsProviders[config.ttsProvider]) {
      config.ttsProviders[config.ttsProvider] = { model: '', voice: '' };
    }
    config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
  }

  if (argv.ttsInstructions) {
    config.ttsInstructions = argv.ttsInstructions;
  }

  if (argv.saveConfig) {
    saveConfigToFile(argv.saveConfig, config);
  }

  if (argv._.length < 1) {
    console.error('Error: No video file specified');
    console.log('Usage: node dist/index.js <video_file_path> [options]');
    console.log('Use --help for more information');
    process.exit(1);
  }

  const videoFilePath = String(argv._[0]);

  if (argv.estimate) {
    try {
      const costBreakdown = await estimateCost(videoFilePath, config);
      console.log('\n=== COST ESTIMATION ===');
      console.log(JSON.stringify(costBreakdown, null, 2));
      console.log(`\nEstimated total cost: ${costBreakdown.apiCosts.total}`);
      console.log(`Estimated processing time: ${costBreakdown.estimates.estimatedProcessingTimeMinutes.toFixed(1)} minutes`);
      console.log('Note: Actual costs may vary based on image complexity and actual response lengths.');
    } catch (err) {
      console.error('Error estimating costs:', err);
      // Report the failure to the shell instead of exiting with status 0.
      process.exitCode = 1;
    }
  } else {
    try {
      const stats = createStats();
      const visionProvider = VisionProviderFactory.getProvider(config);
      const ttsProvider = TTSProviderFactory.getProvider(config);
      await generateAudioDescription(videoFilePath, visionProvider, ttsProvider, config, stats);
    } catch (err) {
      console.error('Error generating audio description:', err);
      // Report the failure to the shell instead of exiting with status 0.
      process.exitCode = 1;
    }
  }
}
|
||||
113
src/interfaces/index.ts
Normal file
113
src/interfaces/index.ts
Normal file
@@ -0,0 +1,113 @@
|
||||
// Common interfaces for the application

// Vision provider interfaces

/** Token counts for a single vision request. */
export interface VisionUsage {
  inputTokens: number;
  outputTokens: number;
  totalTokens: number;
}

/** Text produced by a vision provider together with its token usage. */
export interface VisionResult {
  description: string;
  usage: VisionUsage;
}

/** Per-provider settings for a vision backend. */
export interface VisionProviderConfig {
  apiKey?: string;
  model: string;
  /** Upper bound on generated tokens; providers fall back to their own default when unset. */
  maxTokens?: number;
  /** Overrides the provider's default API endpoint. */
  baseUrl?: string;
}

/** Contract implemented by every vision backend. */
export interface VisionProvider {
  /** Describe a single image file according to `prompt`. */
  describeImage(imagePath: string, prompt: string): Promise<VisionResult>;

  /** Describe the differences between two image files according to `prompt`. */
  compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult>;

  /**
   * Describe a sequence of image files as one batch.
   *
   * `lastBatchContext` carries the previous batch's description so providers
   * can keep continuity; implementations tolerate a missing context.
   */
  describeBatch(
    imagePaths: string[],
    lastBatchContext: BatchContext | null | undefined,
    prompt: string
  ): Promise<VisionResult>;
}

// TTS provider interfaces

/** Outcome of one text-to-speech request. */
export interface TTSResult {
  /** Length of the generated audio in seconds. */
  duration: number;
  /** Cost measure reported by the provider (the bundled providers report the input character count). */
  cost: number;
}

/** Per-call overrides for a text-to-speech request. */
export interface TTSOptions {
  voice?: string;
  model?: string;
  /** Playback speed multiplier; 1.0 leaves the audio unchanged. */
  speedFactor?: number;
  instructions?: string;
}

/** Per-provider settings for a TTS backend. */
export interface TTSProviderConfig {
  apiKey?: string;
  model: string;
  voice?: string;
  /** Path to a credentials file, for providers that accept one. */
  keyFilename?: string;
}

/** Contract implemented by every TTS backend. */
export interface TTSProvider {
  /** Synthesize `text` into an audio file written at `outputPath`. */
  textToSpeech(
    text: string,
    outputPath: string,
    options?: TTSOptions
  ): Promise<TTSResult>;
}

// Audio segment interface

/** One generated narration clip and where it belongs on the timeline. */
export interface AudioSegment {
  audioFile: string;
  // NOTE(review): startTime/duration are presumably seconds — confirm against the mixing code.
  startTime: number;
  duration: number;
  description: string;
}

// Stats interface

/** Running totals collected while processing a video. */
export interface Stats {
  totalFrames: number;
  totalBatches: number;
  totalVisionInputCost: number;
  totalVisionOutputCost: number;
  totalTTSCost: number;
  totalCost: number;
}

// Batch context interface

/** State carried from one batch to the next. */
export interface BatchContext {
  lastDescription?: string;
  lastFramePaths?: string[];
}

// Result interfaces

/** Files involved in a completed run. */
export interface ProcessingResult {
  videoFile: string;
  audioDescriptionFile: string;
}

/** Structured cost estimate printed by the `--estimate` mode. */
export interface CostBreakdown {
  videoInfo: {
    duration: number;
    totalUnits: number;
    unitType: string;
    processingInterval: number;
  };
  providerInfo: {
    visionProvider: string;
    visionModel: string;
    ttsProvider: string;
    ttsModel: string;
  };
  /** Cost figures, already formatted as strings for display. */
  apiCosts: {
    visionInput: string;
    visionOutput: string;
    tts: string;
    total: string;
  };
  estimates: {
    totalAPICallsToProviders: number;
    estimatedProcessingTimeMinutes: number;
  };
}
|
||||
93
src/providers/tts/elevenLabsTTSProvider.ts
Normal file
93
src/providers/tts/elevenLabsTTSProvider.ts
Normal file
@@ -0,0 +1,93 @@
|
||||
import fs from 'fs';
|
||||
import { execSync } from 'child_process';
|
||||
import axios, { AxiosInstance } from 'axios';
|
||||
import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
|
||||
import { getAudioDuration } from '../../utils/mediaUtils';
|
||||
|
||||
/**
 * TTS provider backed by the ElevenLabs HTTP API.
 */
export class ElevenLabsTTSProvider implements TTSProvider {
  private config: TTSProviderConfig;
  private axiosInstance: AxiosInstance;
  // Request id of the last successful call, forwarded with the next request.
  private lastRequestId: string | null = null;

  constructor(config: TTSProviderConfig) {
    this.config = config;
    this.axiosInstance = axios.create({
      baseURL: 'https://api.elevenlabs.io/v1',
      headers: {
        'xi-api-key': config.apiKey,
        'Content-Type': 'application/json'
      }
    });
  }

  /**
   * Convert text to speech and write the result to `outputPath` as MP3.
   *
   * The speed factor is applied exactly once, with FFmpeg's `atempo` filter,
   * after the audio has been downloaded. It is deliberately not also sent to
   * the API, which would compound the speed-up.
   *
   * @param text - Text to convert to speech
   * @param outputPath - Output path for the audio file
   * @param options - Voice, model and speed overrides
   * @returns Duration of the generated audio in seconds and the character
   *          count as cost; on failure a 1-second silent file is written and
   *          `{ duration: 1, cost: 0 }` is returned
   */
  async textToSpeech(
    text: string,
    outputPath: string,
    options: TTSOptions = {}
  ): Promise<TTSResult> {
    // Raw download location: "<name>_temp<.ext>" next to the final file.
    const tempOutputPath = outputPath.replace(/\.\w+$/, '_temp$&');

    try {
      const voice = options.voice || this.config.voice || 'JBFqnCBsd6RMkjVDRZzb';
      const model = options.model || this.config.model || 'eleven_multilingual_v2';
      const speedFactor = options.speedFactor || 1.0;

      const requestBody: any = {
        text,
        model_id: model,
        voice_settings: {
          stability: 0.5,
          similarity_boost: 0.75,
          use_speaker_boost: true
        }
      };

      // Reference the previous request so the API can take it into account.
      if (this.lastRequestId) {
        requestBody.previous_request_ids = [this.lastRequestId];
      }

      const response = await this.axiosInstance.post(
        `/text-to-speech/${voice}`,
        requestBody,
        {
          params: { output_format: 'mp3_44100_128' },
          responseType: 'arraybuffer'
        }
      );

      this.lastRequestId = response.headers['request-id'] || null;

      const audioBuffer = Buffer.from(response.data);
      fs.writeFileSync(tempOutputPath, audioBuffer);

      // Cost is reported as the number of characters submitted.
      const cost = text.length;

      if (speedFactor !== 1.0) {
        execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
        fs.unlinkSync(tempOutputPath);
      } else {
        fs.renameSync(tempOutputPath, outputPath);
      }

      const audioDuration = getAudioDuration(outputPath);

      return {
        duration: audioDuration,
        cost: cost
      };
    } catch (error: any) {
      if (error.response) {
        console.error(`ElevenLabs TTS error (${error.response.status}):`,
          Buffer.from(error.response.data).toString());
      } else {
        console.error('ElevenLabs TTS error:', error.message);
      }

      // Do not leave a partial download behind when a later step failed.
      fs.rmSync(tempOutputPath, { force: true });

      // Fall back to a short silent clip so the caller still gets a file.
      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);
      return {
        duration: 1,
        cost: 0
      };
    }
  }
}
|
||||
94
src/providers/tts/googleCloudTTSProvider.ts
Normal file
94
src/providers/tts/googleCloudTTSProvider.ts
Normal file
@@ -0,0 +1,94 @@
|
||||
import fs from 'fs';
|
||||
import { execSync } from 'child_process';
|
||||
import { TextToSpeechClient } from '@google-cloud/text-to-speech';
|
||||
import { google } from '@google-cloud/text-to-speech/build/protos/protos';
|
||||
import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
|
||||
import { getAudioDuration } from '../../utils/mediaUtils';
|
||||
|
||||
/**
 * TTS provider backed by Google Cloud Text-to-Speech.
 */
export class GoogleCloudTTSProvider implements TTSProvider {
  private config: TTSProviderConfig;
  private client: TextToSpeechClient;

  constructor(config: TTSProviderConfig) {
    this.config = config;

    const clientConfig: any = {
      apiKey: config.apiKey,
      fallback: true
    };

    if (config.keyFilename) {
      clientConfig.keyFilename = config.keyFilename;
    }

    this.client = new TextToSpeechClient(clientConfig);
  }

  /**
   * Convert text to speech and write the result to `outputPath` as MP3.
   *
   * The speed factor is applied exactly once, with FFmpeg's `atempo` filter,
   * after synthesis. It is deliberately not also sent as `speakingRate`,
   * which would compound the speed-up.
   *
   * @param text - Text to convert to speech
   * @param outputPath - Output path for the audio file
   * @param options - Voice and speed overrides
   * @returns Duration of the generated audio in seconds and the character
   *          count as cost; on failure a 1-second silent file is written and
   *          `{ duration: 1, cost: 0 }` is returned
   */
  async textToSpeech(
    text: string,
    outputPath: string,
    options: TTSOptions = {}
  ): Promise<TTSResult> {
    // Raw synthesis output location: "<name>_temp<.ext>" next to the final file.
    const tempOutputPath = outputPath.replace(/\.\w+$/, '_temp$&');

    try {
      const voice = options.voice || this.config.voice || 'en-US-Chirp-HD-F';
      const speedFactor = options.speedFactor || 1.0;

      const request: google.cloud.texttospeech.v1.ISynthesizeSpeechRequest = {
        input: { text },
        voice: {
          languageCode: this.extractLanguageCode(voice),
          name: voice
        },
        audioConfig: {
          audioEncoding: 'MP3'
        }
      };

      const [response] = await this.client.synthesizeSpeech(request);

      if (!response.audioContent) {
        throw new Error('No audio content returned from Google Cloud TTS');
      }

      // The client may hand bytes back either as raw bytes or as a string;
      // NOTE(review): string form is taken to be base64 (protobuf bytes convention) — confirm.
      const audioBuffer = typeof response.audioContent === 'string'
        ? Buffer.from(response.audioContent, 'base64')
        : Buffer.from(response.audioContent);

      fs.writeFileSync(tempOutputPath, audioBuffer);

      // Cost is reported as the number of characters submitted.
      const cost = text.length;

      if (speedFactor !== 1.0) {
        execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
        fs.unlinkSync(tempOutputPath);
      } else {
        fs.renameSync(tempOutputPath, outputPath);
      }

      const audioDuration = getAudioDuration(outputPath);

      return {
        duration: audioDuration,
        cost: cost
      };
    } catch (error: any) {
      console.error('Google Cloud TTS error:', error.message);

      // Do not leave a partial file behind when a later step failed.
      fs.rmSync(tempOutputPath, { force: true });

      // Fall back to a short silent clip so the caller still gets a file.
      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);
      return {
        duration: 1,
        cost: 0
      };
    }
  }

  /**
   * Derive the language code from a voice name by joining its first two
   * dash-separated parts (e.g. "en-US-Chirp-HD-F" -> "en-US").
   * Falls back to "en-US" when the name has fewer than two parts.
   */
  private extractLanguageCode(voiceName: string): string {
    const parts = voiceName.split('-');
    if (parts.length >= 2) {
      return `${parts[0]}-${parts[1]}`;
    }
    return 'en-US';
  }
}
|
||||
4
src/providers/tts/index.ts
Normal file
4
src/providers/tts/index.ts
Normal file
@@ -0,0 +1,4 @@
|
||||
export * from './ttsProviderFactory';
|
||||
export * from './openAITTSProvider';
|
||||
export * from './elevenLabsTTSProvider';
|
||||
export * from './googleCloudTTSProvider';
|
||||
82
src/providers/tts/openAITTSProvider.ts
Normal file
82
src/providers/tts/openAITTSProvider.ts
Normal file
@@ -0,0 +1,82 @@
|
||||
import fs from 'fs';
|
||||
import { execSync } from 'child_process';
|
||||
import { OpenAI } from 'openai';
|
||||
import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
|
||||
import { getAudioDuration } from '../../utils/mediaUtils';
|
||||
|
||||
/**
 * Text-to-speech provider that talks to the OpenAI speech endpoint.
 */
export class OpenAITTSProvider implements TTSProvider {
  private config: TTSProviderConfig;
  private openai: OpenAI;

  constructor(config: TTSProviderConfig) {
    this.config = config;
    this.openai = new OpenAI({ apiKey: config.apiKey });
  }

  /**
   * Synthesize `text` and store the audio at `outputPath`.
   *
   * Per-call options take precedence over the provider config. When a speed
   * factor other than 1.0 is requested the audio is re-encoded through FFmpeg
   * with the `atempo` filter. If anything fails, a 1-second silent file is
   * written instead.
   *
   * @param text - Text to convert to speech
   * @param outputPath - Output path for the audio file
   * @param options - Additional options
   * @returns Duration of the generated audio in seconds and cost
   */
  async textToSpeech(
    text: string,
    outputPath: string,
    options: TTSOptions = {}
  ): Promise<TTSResult> {
    try {
      const selectedVoice = options.voice || this.config.voice;
      const selectedModel = options.model || this.config.model;
      const tempo = options.speedFactor || 1.0;

      // The raw download lands next to the final file as "<name>_temp<.ext>".
      const rawPath = outputPath.replace(/\.\w+$/, '_temp$&');

      const speech = await this.openai.audio.speech.create({
        model: selectedModel,
        voice: selectedVoice as any,
        input: text,
        ...(options.instructions ? { instructions: options.instructions } : {})
      });

      fs.writeFileSync(rawPath, Buffer.from(await speech.arrayBuffer()));

      if (tempo === 1.0) {
        // No tempo change requested: the download is the final file.
        fs.renameSync(rawPath, outputPath);
      } else {
        // Re-encode at the requested tempo, then drop the raw download.
        execSync(`ffmpeg -v error -i "${rawPath}" -filter:a "atempo=${tempo}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
        fs.unlinkSync(rawPath);
      }

      return {
        // Measured from the written file so timing reflects the real audio.
        duration: getAudioDuration(outputPath),
        // Cost is expressed as the number of characters submitted.
        cost: text.length
      };
    } catch (error) {
      console.error("Error generating speech:", error);
      // Substitute a short silent clip so the caller still gets a file.
      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);
      return { duration: 1, cost: 0 };
    }
  }
}
|
||||
31
src/providers/tts/ttsProviderFactory.ts
Normal file
31
src/providers/tts/ttsProviderFactory.ts
Normal file
@@ -0,0 +1,31 @@
|
||||
import { TTSProvider } from '../../interfaces';
|
||||
import { Config } from '../../config/config';
|
||||
import { OpenAITTSProvider } from './openAITTSProvider';
|
||||
import { ElevenLabsTTSProvider } from './elevenLabsTTSProvider';
|
||||
import { GoogleCloudTTSProvider } from './googleCloudTTSProvider';
|
||||
|
||||
/**
 * Builds the TTS provider selected in the configuration.
 */
export class TTSProviderFactory {
  /**
   * Instantiate the provider named by `config.ttsProvider` using its entry
   * in `config.ttsProviders`.
   *
   * @throws Error when the provider has no configuration entry, or when no
   *         implementation exists for the provider name
   */
  static getProvider(config: Config): TTSProvider {
    const { ttsProvider: providerName } = config;
    const providerConfig = config.ttsProviders[providerName];

    if (!providerConfig) {
      throw new Error(`TTS provider "${providerName}" not configured.`);
    }

    if (providerName === 'openai') {
      return new OpenAITTSProvider(providerConfig);
    }
    if (providerName === 'elevenlabs') {
      return new ElevenLabsTTSProvider(providerConfig);
    }
    if (providerName === 'google') {
      return new GoogleCloudTTSProvider(providerConfig);
    }
    // Add other providers here

    throw new Error(`TTS provider "${providerName}" not implemented.`);
  }
}
|
||||
179
src/providers/vision/geminiVisionProvider.ts
Normal file
179
src/providers/vision/geminiVisionProvider.ts
Normal file
@@ -0,0 +1,179 @@
|
||||
import fs from 'fs';
|
||||
import { GoogleGenerativeAI } from '@google/generative-ai';
|
||||
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
|
||||
|
||||
/**
 * Google Gemini Vision Provider Implementation
 */
export class GeminiVisionProvider implements VisionProvider {
  private config: VisionProviderConfig;
  private genAI: GoogleGenerativeAI;
  private model: any;

  constructor(config: VisionProviderConfig) {
    this.config = config;
    this.genAI = new GoogleGenerativeAI(config.apiKey!);
    this.model = this.genAI.getGenerativeModel({ model: config.model });
  }

  /**
   * Describe a single image
   * @param imagePath - Path to the image file
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats; a fixed fallback description with
   *          zero usage when the request fails
   */
  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
    try {
      const imagePart = this.toInlineImagePart(imagePath);

      const result = await this.model.generateContent([prompt, imagePart]);
      const response = await result.response;

      return this.toVisionResult(response, this.estimateInputTokens(prompt, 1));
    } catch (error) {
      console.error("Error describing image with Gemini:", error);
      return {
        description: "Unable to describe this image.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Compare two images and describe the differences
   * @param image1Path - Path to the first image
   * @param image2Path - Path to the second image
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats; a fixed fallback description with
   *          zero usage when the request fails
   */
  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
    try {
      const image1Part = this.toInlineImagePart(image1Path);
      const image2Part = this.toInlineImagePart(image2Path);

      const result = await this.model.generateContent([prompt, image1Part, image2Part]);
      const response = await result.response;

      return this.toVisionResult(response, this.estimateInputTokens(prompt, 2));
    } catch (error) {
      console.error("Error comparing images with Gemini:", error);
      return {
        description: "Unable to describe the differences between these images.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Describe a batch of images
   * @param imagePaths - Array of paths to the images
   * @param lastBatchContext - Context from the previous batch
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats; a fixed fallback description with
   *          zero usage when the request fails
   */
  async describeBatch(
    imagePaths: string[],
    lastBatchContext: BatchContext,
    prompt: string
  ): Promise<VisionResult> {
    try {
      // Prepend the previous batch's description, when there is one, so the
      // model can keep continuity across batches.
      let contextualPrompt = prompt;
      if (lastBatchContext && lastBatchContext.lastDescription) {
        contextualPrompt = `Previous batch summary: ${lastBatchContext.lastDescription}\n\n${prompt}`;
      }

      // Prompt first, then every image in order.
      const contentParts: any[] = [contextualPrompt];
      for (const imagePath of imagePaths) {
        contentParts.push(this.toInlineImagePart(imagePath));
      }

      const result = await this.model.generateContent(contentParts);
      const response = await result.response;

      return this.toVisionResult(
        response,
        this.estimateInputTokens(contextualPrompt, imagePaths.length)
      );
    } catch (error) {
      console.error("Error describing batch of images with Gemini:", error);
      return {
        description: "Unable to describe this batch of images.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /** Read an image file and wrap it as a base64 inline-data content part. */
  private toInlineImagePart(imagePath: string) {
    const imageData = fs.readFileSync(imagePath);
    const mimeType = 'image/jpeg'; // Assuming JPEG, could be detected based on file extension
    return {
      inlineData: {
        data: imageData.toString('base64'),
        mimeType
      }
    };
  }

  /** Rough input-token estimate: ~4 characters per token plus 1000 per image. */
  private estimateInputTokens(promptText: string, imageCount: number): number {
    return Math.ceil(promptText.length / 4) + (1000 * imageCount);
  }

  /**
   * Turn a Gemini response into a VisionResult. Token counts come from the
   * response's `usageMetadata` when it is present; otherwise the rough
   * character-based estimates are used.
   */
  private toVisionResult(response: any, estimatedInputTokens: number): VisionResult {
    const text = response.text();
    const reported = response.usageMetadata;

    const inputTokens = reported?.promptTokenCount ?? estimatedInputTokens;
    const outputTokens = reported?.candidatesTokenCount ?? Math.ceil(text.length / 4);

    return {
      description: text,
      usage: {
        inputTokens,
        outputTokens,
        totalTokens: reported?.totalTokenCount ?? (inputTokens + outputTokens)
      }
    };
  }
}
|
||||
5
src/providers/vision/index.ts
Normal file
5
src/providers/vision/index.ts
Normal file
@@ -0,0 +1,5 @@
|
||||
export * from './visionProviderFactory';
|
||||
export * from './openAIVisionProvider';
|
||||
export * from './geminiVisionProvider';
|
||||
export * from './ollamaVisionProvider';
|
||||
export * from './openRouterVisionProvider';
|
||||
151
src/providers/vision/ollamaVisionProvider.ts
Normal file
151
src/providers/vision/ollamaVisionProvider.ts
Normal file
@@ -0,0 +1,151 @@
|
||||
import fs from 'fs';
|
||||
import axios, { AxiosInstance } from 'axios';
|
||||
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
|
||||
|
||||
/**
 * Ollama Vision Provider Implementation
 * See: https://github.com/ollama/ollama/blob/main/docs/api.md
 */
export class OllamaVisionProvider implements VisionProvider {
  private config: VisionProviderConfig;
  private axiosInstance: AxiosInstance;

  constructor(config: VisionProviderConfig) {
    this.config = config;
    this.axiosInstance = axios.create({
      baseURL: config.baseUrl || "http://localhost:11434",
      headers: { "Content-Type": "application/json" }
    });
  }

  /**
   * Describe a single image
   * @param imagePath - Path to the image file
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats; a fixed fallback description with
   *          zero usage when the request fails
   */
  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
    try {
      const base64Image = fs.readFileSync(imagePath).toString('base64');
      return await this.generate(prompt, [base64Image], 0.1);
    } catch (error) {
      console.error("Ollama describeImage error:", error);
      return {
        description: "Unable to describe this image.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Compare two images and describe differences
   * @param image1Path - Path to the first image
   * @param image2Path - Path to the second image
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats; a fixed fallback description with
   *          zero usage when the request fails
   */
  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
    try {
      const image1Data = fs.readFileSync(image1Path).toString('base64');
      const image2Data = fs.readFileSync(image2Path).toString('base64');
      return await this.generate(prompt, [image1Data, image2Data], 0.2);
    } catch (error) {
      console.error("Ollama compareImages error:", error);
      return {
        description: "Unable to describe the differences.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Describe a batch of images
   * @param imagePaths - Array of paths to the images
   * @param lastBatchContext - Context from the previous batch (optional)
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats; a fixed fallback description with
   *          zero usage when the request fails
   */
  async describeBatch(
    imagePaths: string[],
    lastBatchContext: BatchContext,
    prompt: string
  ): Promise<VisionResult> {
    try {
      let userPrompt = prompt;

      // If there's context, prepend it. This helps maintain a storyline across batches.
      if (lastBatchContext && lastBatchContext.lastDescription) {
        userPrompt = `Previous batch summary: ${lastBatchContext.lastDescription}\n\n${prompt}`;
      }

      const imagesBase64 = imagePaths.map(fp => fs.readFileSync(fp).toString('base64'));

      // Batches get an explicit request timeout of 120000 ms (2 minutes).
      return await this.generate(userPrompt, imagesBase64, 0.2, 120000);
    } catch (error) {
      console.error("Ollama describeBatch error:", error);
      return {
        description: "Unable to describe this batch of images.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Send one non-streaming `/api/generate` request and map the reply to a
   * VisionResult.
   *
   * The generation limit is passed as `num_predict`, which is the option
   * Ollama uses for the maximum number of generated tokens. Token usage is
   * taken from `prompt_eval_count` / `eval_count` when the reply has them.
   *
   * @param prompt - Prompt text
   * @param images - Base64-encoded images
   * @param temperature - Sampling temperature
   * @param timeoutMs - Optional request timeout in milliseconds
   */
  private async generate(
    prompt: string,
    images: string[],
    temperature: number,
    timeoutMs?: number
  ): Promise<VisionResult> {
    const response = await this.axiosInstance.post(
      '/api/generate',
      {
        model: this.config.model,
        prompt: prompt,
        images: images,
        stream: false,
        options: {
          num_predict: this.config.maxTokens || 300,
          temperature: temperature
        }
      },
      timeoutMs === undefined ? undefined : { timeout: timeoutMs }
    );

    const combinedText = response.data.response || "";
    const inputTokens = response.data.prompt_eval_count || 0;
    const outputTokens = response.data.eval_count || 0;

    return {
      description: combinedText.trim(),
      usage: {
        inputTokens,
        outputTokens,
        totalTokens: inputTokens + outputTokens
      }
    };
  }
}
|
||||
193
src/providers/vision/openAIVisionProvider.ts
Normal file
193
src/providers/vision/openAIVisionProvider.ts
Normal file
@@ -0,0 +1,193 @@
|
||||
import fs from 'fs';
|
||||
import { OpenAI } from 'openai';
|
||||
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
|
||||
|
||||
/**
 * Vision provider that sends images to the OpenAI chat completions API.
 */
export class OpenAIVisionProvider implements VisionProvider {
  private config: VisionProviderConfig;
  private openai: OpenAI;

  constructor(config: VisionProviderConfig) {
    this.config = config;
    this.openai = new OpenAI({ apiKey: config.apiKey });
  }

  /**
   * Describe one image.
   * @param imagePath - Path to the image file
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats, or a fixed fallback with zero
   *          usage when the request fails
   */
  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
    try {
      const picture = this.imagePart(imagePath);

      const completion = await this.openai.chat.completions.create({
        model: this.config.model,
        temperature: 0.1,
        messages: [
          {
            role: "user",
            content: [{ type: "text", text: prompt }, picture]
          }
        ],
        max_completion_tokens: this.config.maxTokens || 300
      });

      return this.toResult(completion);
    } catch (error) {
      console.error("Error describing image:", error);
      return {
        description: "Unable to describe this image.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Describe what differs between two images.
   * @param image1Path - Path to the first image
   * @param image2Path - Path to the second image
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats, or a fixed fallback with zero
   *          usage when the request fails
   */
  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
    try {
      const first = this.imagePart(image1Path);
      const second = this.imagePart(image2Path);

      const completion = await this.openai.chat.completions.create({
        model: this.config.model,
        messages: [
          {
            role: "user",
            content: [{ type: "text", text: prompt }, first, second]
          }
        ],
        max_completion_tokens: this.config.maxTokens || 300
      });

      return this.toResult(completion);
    } catch (error) {
      console.error("Error comparing images:", error);
      return {
        description: "Unable to describe the differences between these images.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Describe a sequence of images in a single request.
   * @param imagePaths - Array of paths to the images
   * @param lastBatchContext - Context from the previous batch
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats, or a fixed fallback with zero
   *          usage when the request fails
   */
  async describeBatch(
    imagePaths: string[],
    lastBatchContext: BatchContext,
    prompt: string
  ): Promise<VisionResult> {
    try {
      // The user message carries the prompt followed by every frame in order.
      const frames = imagePaths.map(framePath => this.imagePart(framePath));
      const userMessage = {
        role: "user",
        content: [{ type: "text", text: prompt }, ...frames]
      };

      // The previous batch's description, when present, goes first as a
      // system message.
      const messages: any[] = [];
      if (lastBatchContext && lastBatchContext.lastDescription) {
        messages.push({
          role: "system",
          content: `Previous batch summary: ${lastBatchContext.lastDescription}`
        });
      }
      messages.push(userMessage);

      const completion = await this.openai.chat.completions.create({
        model: this.config.model,
        messages,
        max_completion_tokens: this.config.maxTokens || 300
      });

      return this.toResult(completion);
    } catch (error) {
      console.error("Error describing batch of images:", error);
      return {
        description: "Unable to describe this batch of images.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /** Read an image file and wrap it as a base64 `image_url` content part. */
  private imagePart(filePath: string): any {
    const encoded = fs.readFileSync(filePath).toString('base64');
    return {
      type: "image_url",
      image_url: {
        url: `data:image/jpeg;base64,${encoded}`
      }
    };
  }

  /** Extract the first choice's text and the token usage from a completion. */
  private toResult(completion: any): VisionResult {
    const usage = completion.usage;
    return {
      description: completion.choices[0].message.content?.trim() || "No description generated.",
      usage: {
        inputTokens: usage?.prompt_tokens || 0,
        outputTokens: usage?.completion_tokens || 0,
        totalTokens: usage?.total_tokens || 0
      }
    };
  }
}
|
||||
171
src/providers/vision/openRouterVisionProvider.ts
Normal file
171
src/providers/vision/openRouterVisionProvider.ts
Normal file
@@ -0,0 +1,171 @@
|
||||
import fs from 'fs';
|
||||
import axios, { AxiosInstance } from 'axios';
|
||||
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
|
||||
|
||||
/**
 * Vision provider that posts to an OpenRouter-style `/chat/completions` endpoint.
 *
 * Images are read from disk, base64-encoded and sent inline as JPEG data URLs.
 * Each public method logs failures and resolves with a fallback description
 * and zero token usage instead of rejecting.
 */
export class OpenRouterVisionProvider implements VisionProvider {
  private config: VisionProviderConfig;
  private axiosInstance: AxiosInstance;

  /**
   * @param config - Provider settings; `apiKey`, `model`, and the optional
   *                 `baseUrl` / `maxTokens` are the fields read by this class
   */
  constructor(config: VisionProviderConfig) {
    const headers = {
      'Authorization': `Bearer ${config.apiKey}`,
      'Content-Type': 'application/json',
      'HTTP-Referer': 'https://github.com/anomalyco/aidio-description',
      'X-Title': 'Aidio Description Generator'
    };
    this.config = config;
    this.axiosInstance = axios.create({
      baseURL: config.baseUrl || 'https://openrouter.ai/api/v1',
      headers
    });
  }

  /** Read a file synchronously and return its contents as a base64 string. */
  private readBase64(imagePath: string): string {
    return fs.readFileSync(imagePath).toString('base64');
  }

  /** Wrap base64 image data as an inline JPEG `image_url` content part. */
  private imagePart(base64: string) {
    return {
      type: 'image_url',
      image_url: { url: `data:image/jpeg;base64,${base64}` }
    };
  }

  /** Map a chat-completions response payload onto a VisionResult. */
  private toResult(payload: any): VisionResult {
    return {
      description: payload.choices?.[0]?.message?.content?.trim() || 'No description generated.',
      usage: {
        inputTokens: payload.usage?.prompt_tokens || 0,
        outputTokens: payload.usage?.completion_tokens || 0,
        totalTokens: payload.usage?.total_tokens || 0
      }
    };
  }

  /** Log a failed call under `label` and build the zero-usage fallback result. */
  private fallback(label: string, error: any, description: string): VisionResult {
    console.error(label, error.response?.data || error.message);
    return {
      description,
      usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
    };
  }

  /**
   * Describe a single image.
   * @param imagePath - Path to the image file
   * @param prompt - Instruction text sent ahead of the image
   * @returns Description and usage, or the fallback result on failure
   */
  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
    try {
      const frame = this.imagePart(this.readBase64(imagePath));
      const response = await this.axiosInstance.post('/chat/completions', {
        model: this.config.model,
        temperature: 0.1,
        messages: [
          { role: 'user', content: [{ type: 'text', text: prompt }, frame] }
        ],
        max_tokens: this.config.maxTokens || 300
      });
      return this.toResult(response.data);
    } catch (error: any) {
      return this.fallback('OpenRouter describeImage error:', error, 'Unable to describe this image.');
    }
  }

  /**
   * Describe the differences between two images.
   * @param image1Path - Path to the first image
   * @param image2Path - Path to the second image
   * @param prompt - Instruction text sent ahead of the images
   * @returns Description and usage, or the fallback result on failure
   */
  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
    try {
      const frames = [image1Path, image2Path]
        .map(imagePath => this.readBase64(imagePath))
        .map(base64 => this.imagePart(base64));
      const response = await this.axiosInstance.post('/chat/completions', {
        model: this.config.model,
        temperature: 0.1,
        messages: [
          { role: 'user', content: [{ type: 'text', text: prompt }, ...frames] }
        ],
        max_tokens: this.config.maxTokens || 300
      });
      return this.toResult(response.data);
    } catch (error: any) {
      return this.fallback(
        'OpenRouter compareImages error:',
        error,
        'Unable to describe the differences between these images.'
      );
    }
  }

  /**
   * Describe a batch of images in one request.
   *
   * Unlike the single-image calls, this request does not set `temperature`.
   *
   * @param imagePaths - Paths of the images in the batch
   * @param lastBatchContext - Context from the previous batch; its
   *                           `lastDescription`, when set, is sent as a system message
   * @param prompt - Instruction text sent ahead of the images
   * @returns Description and usage, or the fallback result on failure
   */
  async describeBatch(
    imagePaths: string[],
    lastBatchContext: BatchContext,
    prompt: string
  ): Promise<VisionResult> {
    try {
      // All frames are read before the message list is assembled.
      const encodedFrames = imagePaths.map(imagePath => this.readBase64(imagePath));

      const userContent: any[] = [{ type: 'text', text: prompt }];
      for (const encoded of encodedFrames) {
        userContent.push(this.imagePart(encoded));
      }

      const messages: any[] = [];
      const previousSummary = lastBatchContext && lastBatchContext.lastDescription;
      if (previousSummary) {
        messages.push({
          role: 'system',
          content: `Previous batch summary: ${previousSummary}`
        });
      }
      messages.push({ role: 'user', content: userContent });

      const response = await this.axiosInstance.post('/chat/completions', {
        model: this.config.model,
        messages,
        max_tokens: this.config.maxTokens || 300
      });
      return this.toResult(response.data);
    } catch (error: any) {
      return this.fallback(
        'OpenRouter describeBatch error:',
        error,
        'Unable to describe this batch of images.'
      );
    }
  }
}
|
||||
34
src/providers/vision/visionProviderFactory.ts
Normal file
34
src/providers/vision/visionProviderFactory.ts
Normal file
@@ -0,0 +1,34 @@
|
||||
import { VisionProvider } from '../../interfaces';
|
||||
import { Config } from '../../config/config';
|
||||
import { OpenAIVisionProvider } from './openAIVisionProvider';
|
||||
import { GeminiVisionProvider } from './geminiVisionProvider';
|
||||
import { OllamaVisionProvider } from './ollamaVisionProvider';
|
||||
import { OpenRouterVisionProvider } from './openRouterVisionProvider';
|
||||
|
||||
/**
 * Factory that instantiates the vision provider selected in the configuration.
 */
export class VisionProviderFactory {
  /**
   * Build the provider named by `config.visionProvider`, handing it the
   * matching entry from `config.visionProviders`.
   *
   * @param config - Application configuration
   * @returns A provider instance for the selected name
   * @throws Error if the selected name has no entry in `visionProviders`, or
   *         if the name is not one of the providers handled here
   */
  static getProvider(config: Config): VisionProvider {
    const providerName = config.visionProvider;
    const providerConfig = config.visionProviders[providerName];

    // The configuration check comes first, so an unknown name without a
    // config entry reports "not configured" rather than "not implemented".
    if (!providerConfig) {
      throw new Error(`Vision provider "${providerName}" not configured.`);
    }

    if (providerName === 'openai') {
      return new OpenAIVisionProvider(providerConfig);
    }
    if (providerName === 'gemini') {
      return new GeminiVisionProvider(providerConfig);
    }
    if (providerName === 'ollama') {
      return new OllamaVisionProvider(providerConfig);
    }
    if (providerName === 'openrouter') {
      return new OpenRouterVisionProvider(providerConfig);
    }
    // Add other providers above this line.

    throw new Error(`Vision provider "${providerName}" not implemented.`);
  }
}
|
||||
38
src/utils/configUtils.ts
Normal file
38
src/utils/configUtils.ts
Normal file
@@ -0,0 +1,38 @@
|
||||
import fs from 'fs';
|
||||
import { Config } from '../config/config';
|
||||
|
||||
/**
 * Read a JSON configuration file and return the parsed object.
 *
 * A read or parse failure is logged and the process is terminated with
 * exit code 1, so this function never returns in that case.
 *
 * @param filePath - Path to the configuration file
 * @returns The parsed configuration (not validated against `Config`)
 */
export function loadConfigFromFile(filePath: string): Partial<Config> {
  let parsed: Partial<Config>;

  try {
    const rawText = fs.readFileSync(filePath, 'utf8');
    parsed = JSON.parse(rawText);
    console.log(`Loaded configuration from ${filePath}`);
  } catch (error) {
    console.error(`Error loading config from ${filePath}:`, error);
    process.exit(1);
  }

  return parsed;
}
|
||||
|
||||
/**
 * Write a configuration object to disk as pretty-printed JSON.
 *
 * A shallow copy is taken and a fixed list of top-level keys is removed from
 * it before serialising; nested objects are written unchanged. Failures are
 * logged and not rethrown.
 *
 * @param filePath - Path to save the configuration file
 * @param config - Configuration object to save
 */
export function saveConfigToFile(filePath: string, config: any): void {
  // Top-level keys that are never persisted.
  const omittedKeys = ['_', '$0', 'video_file_path', 'estimate', 'config', 'saveConfig', 'help', 'version', 'h'];

  try {
    const persisted = { ...config };
    for (const key of omittedKeys) {
      delete persisted[key];
    }

    const serialized = JSON.stringify(persisted, null, 2);
    fs.writeFileSync(filePath, serialized, 'utf8');
    console.log(`Configuration saved to ${filePath}`);
  } catch (error) {
    console.error(`Error saving config to ${filePath}:`, error);
  }
}
|
||||
182
src/utils/costEstimator.ts
Normal file
182
src/utils/costEstimator.ts
Normal file
@@ -0,0 +1,182 @@
|
||||
import { Config } from '../config/config';
|
||||
import { CostBreakdown } from '../interfaces';
|
||||
import { getVideoDuration } from './mediaUtils';
|
||||
|
||||
type TTSPricingModel = number | { inputTokens: number; outputTokens: number };
|
||||
|
||||
/**
 * Estimate the API cost of generating audio descriptions for a video.
 *
 * The estimate is built from fixed per-unit token and character assumptions
 * and a built-in pricing table; a missing pricing entry is reported with a
 * warning and counted as zero cost.
 *
 * @param videoFilePath - Path to the input video file
 * @param options - Configuration to estimate with. No defaults are merged in
 *                  here, so the provider, model and interval fields read
 *                  below must already be present.
 * @returns Cost estimation breakdown (monetary values as 4-decimal strings)
 */
export async function estimateCost(
  videoFilePath: string,
  options: Partial<Config> = {}
): Promise<CostBreakdown> {
  // Work on a shallow copy of the caller's options; nothing is merged with defaults.
  const settings = { ...options } as Config;

  // Get video duration
  const videoDuration = getVideoDuration(videoFilePath);
  console.log(`Video duration: ${videoDuration} seconds`);

  // Calculate the number of frames or batches to process
  let totalUnits: number;
  let unitCostMultiplier: number;
  let unitType: string;

  if (settings.batchTimeMode) {
    totalUnits = Math.floor(videoDuration / settings.batchWindowDuration);
    unitCostMultiplier = settings.framesInBatch;
    unitType = "batches";
  } else {
    totalUnits = Math.floor(videoDuration / settings.captureIntervalSeconds);
    unitCostMultiplier = 1;
    unitType = "frames";
  }

  console.log(`Will process ${totalUnits} ${unitType}`);

  // Pricing constants (per 1K units unless otherwise noted)
  const pricing: {
    vision: Record<string, Record<string, { input: number; output: number }>>;
    tts: Record<string, Record<string, TTSPricingModel>>;
  } = {
    vision: {
      openai: {
        'gpt-4o': { input: 0.0025, output: 0.01 },
        'gpt-5.4-mini': { input: 0.00015, output: 0.0006 },
        'gpt-4o-mini': { input: 0.00015, output: 0.0006 }
      },
      gemini: {
        'gemini-2.0-flash': { input: 0.0001, output: 0.0004 },
        'gemini-1.5-flash': { input: 0.000075, output: 0.0003 },
        'gemini-1.5-pro': { input: 0.00125, output: 0.005 }
      },
      openrouter: {
        'anthropic/claude-sonnet-4.5': { input: 0.003, output: 0.015 },
        'anthropic/claude-3.5-sonnet': { input: 0.003, output: 0.015 },
        'anthropic/claude-3-haiku': { input: 0.0008, output: 0.004 },
        'google/gemini-2.0-flash-001': { input: 0.0001, output: 0.0004 }
      }
    },
    tts: {
      openai: {
        'tts-1': 0.015,
        'tts-1-hd': 0.030,
        'gpt-4o-mini-tts': { inputTokens: 0.60, outputTokens: 12.00 }
      },
      elevenlabs: {
        'eleven_multilingual_v2': 0.30,
        'eleven_turbo_v2.5': 0.015
      },
      google: {
        'chirp-hd': 0.016,
        'wavenet': 0.016,
        'neural2': 0.016,
        'standard': 0.004
      }
    }
  };

  // Get the pricing for the selected providers
  const visionProvider = settings.visionProvider;
  const visionModel = settings.visionProviders[visionProvider].model;
  const ttsProvider = settings.ttsProvider;
  const ttsModel = settings.ttsProviders[ttsProvider].model;

  // Check if the pricing data exists
  const visionPricing = pricing.vision[visionProvider]?.[visionModel];
  const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel];

  if (!visionPricing) {
    console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`);
  }

  if (!ttsPricing) {
    console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`);
  }

  // Estimated token counts
  const estimatedVisionInputTokens = 1000 * unitCostMultiplier;
  const estimatedPromptTokens = 100;
  const estimatedOutputTokensPerUnit = 75;

  // Estimated character counts for TTS
  const estimatedCharsPerDescription = 200;

  // Calculate estimated costs for first unit
  const firstUnitCost = {
    visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
    visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
    tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
  };

  // For subsequent units, we need context (e.g., previous frames)
  const contextMultiplier = settings.batchTimeMode ? 1.2 : 2;

  const subsequentUnitCost = {
    visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
    visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
    tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
  };

  // One unit is billed at the first-unit rate and the rest at the subsequent
  // rate. A video shorter than one interval has zero units; without the
  // clamps below, "first + (0 - 1) * subsequent" would go negative.
  const firstUnitCount = totalUnits > 0 ? 1 : 0;
  const subsequentUnitCount = Math.max(totalUnits - 1, 0);

  // Calculate total costs
  const totalVisionInputCost =
    firstUnitCount * firstUnitCost.visionInput +
    subsequentUnitCount * subsequentUnitCost.visionInput;

  const totalVisionOutputCost =
    firstUnitCount * firstUnitCost.visionOutput +
    subsequentUnitCount * subsequentUnitCost.visionOutput;

  const totalTTSCost =
    firstUnitCount * firstUnitCost.tts +
    subsequentUnitCount * subsequentUnitCost.tts;

  const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost;

  // Create cost breakdown
  const costBreakdown: CostBreakdown = {
    videoInfo: {
      duration: videoDuration,
      totalUnits: totalUnits,
      unitType: unitType,
      processingInterval: settings.batchTimeMode ? settings.batchWindowDuration : settings.captureIntervalSeconds
    },
    providerInfo: {
      visionProvider: visionProvider,
      visionModel: visionModel,
      ttsProvider: ttsProvider,
      ttsModel: ttsModel
    },
    apiCosts: {
      visionInput: totalVisionInputCost.toFixed(4),
      visionOutput: totalVisionOutputCost.toFixed(4),
      tts: totalTTSCost.toFixed(4),
      total: totalCost.toFixed(4)
    },
    estimates: {
      // Two calls per unit (NOTE(review): presumably one vision + one TTS — confirm).
      totalAPICallsToProviders: totalUnits * 2,
      // Assumes roughly 3 seconds of processing per unit.
      estimatedProcessingTimeMinutes: (totalUnits * 3) / 60
    }
  };

  return costBreakdown;
}
|
||||
|
||||
/**
 * Estimate the TTS cost of speaking `charCount` characters.
 *
 * @param charCount - Number of characters to synthesise
 * @param pricing - A number (price per 1000 characters), an object with
 *                  per-million-token input/output prices, or undefined
 * @returns Estimated cost; 0 when no (or a falsy) pricing entry is given
 */
function calculateTTSCost(charCount: number, pricing: TTSPricingModel | undefined): number {
  if (!pricing) {
    return 0;
  }

  if (typeof pricing !== 'number') {
    // Per-token pricing (e.g., gpt-4o-mini-tts): prices are per 1M tokens.
    const { inputTokens: inputRate, outputTokens: outputRate } = pricing;
    // Rough estimate: 1 char ≈ 0.25 input tokens for English text.
    const inputTokenCount = charCount * 0.25;
    // Audio output is token-heavy: assume 3 output tokens per character.
    const outputTokenCount = charCount * 3;
    return (inputTokenCount * inputRate + outputTokenCount * outputRate) / 1000000;
  }

  // Per-character pricing: price is per 1000 characters.
  return charCount * pricing / 1000;
}
|
||||
4
src/utils/index.ts
Normal file
4
src/utils/index.ts
Normal file
@@ -0,0 +1,4 @@
|
||||
export * from './mediaUtils';
|
||||
export * from './processor';
|
||||
export * from './costEstimator';
|
||||
export * from './configUtils';
|
||||
323
src/utils/mediaUtils.ts
Normal file
323
src/utils/mediaUtils.ts
Normal file
@@ -0,0 +1,323 @@
|
||||
import { execFileSync, execSync } from 'child_process';
import fs from 'fs';
import path from 'path';
import { AudioSegment } from '../interfaces';
import { Config } from '../config/config';
|
||||
|
||||
/**
 * Get the duration of a video file in seconds by querying `ffprobe`.
 *
 * ffprobe is started with an argument vector instead of a shell command line,
 * so paths containing quotes, `$` or backticks are passed through verbatim.
 *
 * @param videoFilePath - Path to the video file
 * @returns Duration in seconds
 * @throws Error if ffprobe fails or its output is not a number
 */
export function getVideoDuration(videoFilePath: string): number {
  const output = execFileSync('ffprobe', [
    '-v', 'error',
    '-show_entries', 'format=duration',
    '-of', 'default=noprint_wrappers=1:nokey=1',
    videoFilePath
  ]);

  const duration = parseFloat(output.toString());
  // Fail here rather than let NaN flow into later frame-count arithmetic.
  if (Number.isNaN(duration)) {
    throw new Error(`Could not determine the duration of "${videoFilePath}"`);
  }
  return duration;
}
|
||||
|
||||
/**
 * Capture a single frame from a video at a specific time position.
 *
 * ffmpeg is started with an argument vector instead of a shell command line,
 * so input and output paths need no quoting or escaping.
 *
 * @param videoFilePath - Path to the video file
 * @param timePosition - Time position in seconds
 * @param outputPath - Output path for the captured frame (overwritten if present)
 * @param lowQuality - If true (default), scale the frame to 360 pixels high,
 *                     keeping the aspect ratio
 * @throws Error if ffmpeg exits with a failure
 */
export function captureVideoFrame(
  videoFilePath: string,
  timePosition: number,
  outputPath: string,
  lowQuality: boolean = true
): void {
  const args = [
    '-v', 'error',
    '-ss', String(timePosition),
    '-i', videoFilePath,
    '-vframes', '1',
    '-q:v', '2'
  ];

  // Add resolution scaling for the low quality option
  if (lowQuality) {
    args.push('-vf', 'scale=-1:360');
  }

  args.push(outputPath, '-y');

  execFileSync('ffmpeg', args);
}
|
||||
|
||||
/**
 * Get the duration of an audio file in seconds by querying `ffprobe`.
 *
 * ffprobe is started with an argument vector instead of a shell command line,
 * so paths containing quotes, `$` or backticks are passed through verbatim.
 *
 * @param audioFilePath - Path to the audio file
 * @returns Duration in seconds
 * @throws Error if ffprobe fails or its output is not a number
 */
export function getAudioDuration(audioFilePath: string): number {
  const output = execFileSync('ffprobe', [
    '-v', 'error',
    '-show_entries', 'format=duration',
    '-of', 'default=noprint_wrappers=1:nokey=1',
    audioFilePath
  ]);

  const duration = parseFloat(output.toString());
  // Fail here rather than hand NaN back to the caller.
  if (Number.isNaN(duration)) {
    throw new Error(`Could not determine the duration of "${audioFilePath}"`);
  }
  return duration;
}
|
||||
|
||||
/**
 * Combine audio segments into a single audio track using lossless intermediates.
 *
 * Three strategies are tried in order:
 *  1. mix the segments one at a time onto a silent WAV base track;
 *  2. mix all segments in a single ffmpeg invocation;
 *  3. write a bash script containing the equivalent commands for manual use.
 *
 * @param segments - Array of audio segment information (`audioFile`, `startTime`)
 * @param outputPath - Output path for the combined audio; a `.mp3` extension
 *                     selects MP3 encoding, anything else gets the WAV data
 * @param videoDuration - Duration of the video in seconds (length of the track)
 * @param settings - Configuration settings; only `tempDir` is read here
 * @returns `outputPath` when a track was produced, otherwise
 *          `{ commandFile }` pointing at the generated script
 */
export function combineAudioSegments(
  segments: AudioSegment[],
  outputPath: string,
  videoDuration: number,
  settings: Config
): string | { commandFile: string } {
  console.log(`Combining ${segments.length} audio segments using lossless intermediates...`);

  try {
    return mixSegmentsSequentially(segments, outputPath, videoDuration, settings.tempDir);
  } catch (error: any) {
    console.error("Error in lossless audio combination:", error.message);
  }

  try {
    console.log("Trying alternative approach with single-step filter...");
    return mixSegmentsInOnePass(segments, outputPath, videoDuration, settings.tempDir);
  } catch (secondError: any) {
    console.error("Alternative approach failed:", secondError.message);
  }

  // Last resort: generate a command file the user can run by hand.
  const commandFile = writeFallbackScript(segments, outputPath, videoDuration, settings.tempDir);
  console.log(`\nCreated executable script with proper FFmpeg commands: ${commandFile}`);
  console.log(`Run this script to generate the audio file.`);
  return { commandFile };
}

/** Run ffmpeg with `-v error` and an argument vector (no shell, so paths need no quoting). */
function runFfmpeg(args: string[]): void {
  execFileSync('ffmpeg', ['-v', 'error', ...args]);
}

/** Create a stereo 44.1 kHz 16-bit PCM WAV of silence lasting `durationSeconds`. */
function createSilentBase(filePath: string, durationSeconds: number): void {
  runFfmpeg([
    '-f', 'lavfi', '-i', 'anullsrc=r=44100:cl=stereo',
    '-t', String(durationSeconds),
    '-c:a', 'pcm_s16le', filePath, '-y'
  ]);
}

/** Convert any audio file to stereo 44.1 kHz 16-bit PCM WAV. */
function standardizeToWav(inputPath: string, wavPath: string): void {
  runFfmpeg(['-i', inputPath, '-ar', '44100', '-ac', '2', '-c:a', 'pcm_s16le', wavPath, '-y']);
}

/** Produce the final output from a WAV: encode to MP3 for `.mp3` targets, otherwise copy. */
function exportFinalAudio(wavPath: string, outputPath: string, mp3LogMessage: string): void {
  if (path.extname(outputPath).toLowerCase() === '.mp3') {
    console.log(mp3LogMessage);
    runFfmpeg(['-i', wavPath, '-c:a', 'libmp3lame', '-q:a', '2', outputPath, '-y']);
  } else {
    fs.copyFileSync(wavPath, outputPath);
  }
}

/** Best-effort removal of a temporary file; a failure is reported but never thrown. */
function removeTempFile(filePath: string): void {
  try {
    if (fs.existsSync(filePath)) {
      fs.unlinkSync(filePath);
    }
  } catch (cleanupError: any) {
    console.warn(`Could not remove temporary file ${filePath}:`, cleanupError.message);
  }
}

/** Quote a string for safe use as a single word in a POSIX shell script. */
function shellQuote(value: string): string {
  return `'${value.replace(/'/g, `'\\''`)}'`;
}

/**
 * Strategy 1: overlay the segments one by one onto a silent base track,
 * keeping every intermediate as WAV and encoding only the final result.
 */
function mixSegmentsSequentially(
  segments: AudioSegment[],
  outputPath: string,
  videoDuration: number,
  tempDir: string
): string {
  const silentBasePath = path.join(tempDir, 'silent_base.wav');
  // Everything created here is removed in `finally`, including after a failure.
  const tempFiles: string[] = [silentBasePath];

  try {
    createSilentBase(silentBasePath, videoDuration);

    // Sort segments by start time to process them in order
    const sortedSegments = [...segments].sort((a, b) => a.startTime - b.startTime);

    let currentAudioPath = silentBasePath;

    for (let i = 0; i < sortedSegments.length; i++) {
      const segment = sortedSegments[i];
      const outputFile = path.join(tempDir, `segment_${i}_output.wav`);
      const standardizedSegment = path.join(tempDir, `segment_${i}_std.wav`);
      const filterPath = path.join(tempDir, `filter_${i}.txt`);
      tempFiles.push(outputFile, standardizedSegment, filterPath);

      // Standardise first to avoid format/sample-rate mismatches while mixing.
      standardizeToWav(segment.audioFile, standardizedSegment);

      // adelay takes milliseconds, one value per channel (stereo → two values).
      const delayMs = Math.round(segment.startTime * 1000);
      const filterContent =
        `[1:a]adelay=${delayMs}|${delayMs}[delayed];\n` +
        `[0:a][delayed]amix=inputs=2:duration=first:dropout_transition=0:normalize=0[out]`;
      fs.writeFileSync(filterPath, filterContent);

      runFfmpeg([
        '-i', currentAudioPath,
        '-i', standardizedSegment,
        '-filter_complex_script', filterPath,
        '-map', '[out]',
        '-c:a', 'pcm_s16le', outputFile, '-y'
      ]);

      // Drop intermediates eagerly so only one full-length mix is on disk at a time.
      if (currentAudioPath !== silentBasePath) {
        removeTempFile(currentAudioPath);
      }
      removeTempFile(standardizedSegment);
      removeTempFile(filterPath);

      currentAudioPath = outputFile;

      console.log(`Added segment ${i + 1}/${sortedSegments.length} at position ${segment.startTime.toFixed(3)}s`);
    }

    // Only at the very end, convert to the requested output format
    exportFinalAudio(currentAudioPath, outputPath, `Converting final lossless WAV to MP3: ${outputPath}`);

    console.log(`Audio description track created: ${outputPath}`);
    return outputPath;
  } finally {
    tempFiles.forEach(removeTempFile);
  }
}

/**
 * Strategy 2: standardise every segment, then delay and mix them all onto the
 * silent base in a single ffmpeg invocation.
 */
function mixSegmentsInOnePass(
  segments: AudioSegment[],
  outputPath: string,
  videoDuration: number,
  tempDir: string
): string {
  const silentBasePath = path.join(tempDir, 'silent_base.wav');
  const filterScriptPath = path.join(tempDir, 'overlay_filter.txt');
  const intermediatePath = path.join(tempDir, 'intermediate_output.wav');
  const tempFiles: string[] = [silentBasePath, filterScriptPath, intermediatePath];

  try {
    createSilentBase(silentBasePath, videoDuration);

    const sortedSegments = [...segments].sort((a, b) => a.startTime - b.startTime);

    // Input 0 is the silent base; segment i becomes input i + 1.
    const inputArgs: string[] = ['-i', silentBasePath];
    let delayFilters = '';
    let mixInputs = '[0:a]';

    sortedSegments.forEach((segment, i) => {
      const stdPath = path.join(tempDir, `std_${i}.wav`);
      tempFiles.push(stdPath);
      standardizeToWav(segment.audioFile, stdPath);

      const delay = Math.round(segment.startTime * 1000);
      inputArgs.push('-i', stdPath);
      delayFilters += `[${i + 1}:a]adelay=${delay}|${delay}[a${i}];\n`;
      mixInputs += `[a${i}]`;
    });

    // normalize=0 keeps the original volume of every input.
    const filterScript =
      delayFilters +
      `${mixInputs}amix=inputs=${sortedSegments.length + 1}:normalize=0:duration=first[aout]`;
    fs.writeFileSync(filterScriptPath, filterScript);

    // Always mix to an intermediate WAV first to maintain quality.
    runFfmpeg([
      ...inputArgs,
      '-filter_complex_script', filterScriptPath,
      '-map', '[aout]',
      '-c:a', 'pcm_s16le', intermediatePath, '-y'
    ]);

    exportFinalAudio(intermediatePath, outputPath, `Converting final audio to MP3...`);

    console.log(`Audio description track created with alternative method: ${outputPath}`);
    return outputPath;
  } finally {
    tempFiles.forEach(removeTempFile);
  }
}

/**
 * Strategy 3: write an executable bash script with the commands needed to
 * build the track, and return the script's path. All paths are shell-quoted
 * and the filter heredoc is quoted so the shell expands nothing inside it.
 */
function writeFallbackScript(
  segments: AudioSegment[],
  outputPath: string,
  videoDuration: number,
  tempDir: string
): string {
  const cmdFilePath = outputPath.replace(/\.\w+$/, '_ffmpeg_cmd.sh');
  const silentBase = shellQuote(path.join(tempDir, 'silent_base.wav'));
  const filterFile = shellQuote(path.join(tempDir, 'filter.txt'));
  const stdFiles = segments.map((_segment, i) => shellQuote(path.join(tempDir, `std_${i}.wav`)));

  const lines: string[] = ['#!/bin/bash', '', '# FFmpeg command to combine audio segments', ''];

  lines.push('# First convert all segments to standard WAV format');
  segments.forEach((segment, i) => {
    lines.push(`ffmpeg -i ${shellQuote(segment.audioFile)} -ar 44100 -ac 2 -c:a pcm_s16le ${stdFiles[i]} -y`);
  });

  lines.push('', '# Create silent base track');
  lines.push(`ffmpeg -f lavfi -i anullsrc=r=44100:cl=stereo -t ${videoDuration} -c:a pcm_s16le ${silentBase} -y`, '');

  lines.push('# Create filter file');
  lines.push(`cat > ${filterFile} << 'EOL'`);
  let mixInputs = '[0:a]';
  segments.forEach((segment, i) => {
    const delay = Math.round(segment.startTime * 1000);
    lines.push(`[${i + 1}:a]adelay=${delay}|${delay}[a${i}];`);
    mixInputs += `[a${i}]`;
  });
  lines.push(`${mixInputs}amix=inputs=${segments.length + 1}:normalize=0:duration=first[aout]`);
  lines.push('EOL', '');

  lines.push('# Run final FFmpeg command');
  const finalCommand: string[] = ['ffmpeg', '-i', silentBase];
  stdFiles.forEach(stdFile => finalCommand.push('-i', stdFile));
  finalCommand.push('-filter_complex_script', filterFile, '-map', '"[aout]"');
  if (path.extname(outputPath).toLowerCase() === '.mp3') {
    finalCommand.push('-c:a', 'libmp3lame', '-q:a', '2');
  } else {
    finalCommand.push('-c:a', 'pcm_s16le');
  }
  finalCommand.push(shellQuote(outputPath), '-y');
  lines.push(finalCommand.join(' '), '');

  lines.push('# Clean up temp files');
  lines.push(`rm ${silentBase} ${filterFile}`);
  stdFiles.forEach(stdFile => lines.push(`rm ${stdFile}`));

  fs.writeFileSync(cmdFilePath, lines.join('\n') + '\n');
  // Make the file executable without going through a shell.
  fs.chmodSync(cmdFilePath, 0o755);

  return cmdFilePath;
}
|
||||
|
||||
/**
 * Remove the temporary files directly inside a directory.
 *
 * Subdirectories are left in place: `unlinkSync` cannot remove a directory,
 * and attempting it would throw and leave the remaining files behind. A
 * directory that does not exist is treated as already clean.
 *
 * @param tempDir - Directory containing temporary files
 */
export function cleanupTempFiles(tempDir: string): void {
  if (!fs.existsSync(tempDir)) {
    return;
  }

  const entries = fs.readdirSync(tempDir, { withFileTypes: true });
  for (const entry of entries) {
    if (entry.isDirectory()) {
      continue;
    }
    fs.unlinkSync(path.join(tempDir, entry.name));
  }
}
|
||||
360
src/utils/processor.ts
Normal file
360
src/utils/processor.ts
Normal file
@@ -0,0 +1,360 @@
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import {
|
||||
VisionProvider,
|
||||
TTSProvider,
|
||||
AudioSegment,
|
||||
Stats,
|
||||
BatchContext,
|
||||
ProcessingResult
|
||||
} from '../interfaces';
|
||||
import { Config, getDefaultConfig } from '../config/config';
|
||||
import { printStats, createStats } from '../config/stats';
|
||||
import { VisionProviderFactory } from '../providers/vision/visionProviderFactory';
|
||||
import { TTSProviderFactory } from '../providers/tts/ttsProviderFactory';
|
||||
import {
|
||||
getVideoDuration,
|
||||
captureVideoFrame,
|
||||
combineAudioSegments
|
||||
} from './mediaUtils';
|
||||
|
||||
/**
 * High-level entry point: generate an audio description from a video path
 * and optional configuration overrides.
 *
 * The overrides are shallow-merged over the default configuration, the temp
 * and output directories are created when missing, and the providers and
 * stats object are built here before delegating to `generateAudioDescription`.
 *
 * @param videoFilePath - Path to the input video file
 * @param options - Optional configuration overrides (top-level keys replace
 *                  the corresponding defaults wholesale)
 * @returns Result of the operation
 */
export async function generateAudioDescriptionFromOptions(
  videoFilePath: string,
  options: Partial<Config> = {}
): Promise<ProcessingResult> {
  const config = { ...getDefaultConfig(), ...options };

  // Temp directory first, then output directory.
  for (const directory of [config.tempDir, config.outputDir]) {
    if (!fs.existsSync(directory)) {
      fs.mkdirSync(directory, { recursive: true });
    }
  }

  // Arguments are evaluated left to right: vision provider, TTS provider, stats.
  return generateAudioDescription(
    videoFilePath,
    VisionProviderFactory.getProvider(config),
    TTSProviderFactory.getProvider(config),
    config,
    createStats()
  );
}
|
||||
|
||||
/**
 * Generate audio description for a video (low-level API requiring pre-initialized providers).
 *
 * When `batchTimeMode` is enabled the work is delegated to
 * generateAudioDescriptionBatch. Otherwise one frame is captured every
 * `captureIntervalSeconds`, described by the vision provider (the first frame
 * on its own, later frames by comparison with the previous one), converted to
 * speech, and all clips are combined into `<video name>_description.mp3`
 * inside `outputDir`.
 *
 * Frames are always captured at their ideal (schedule-based) time, while each
 * audio clip is placed at an adjusted start time so that it never begins
 * before the previous clip plus a short buffer has finished.
 *
 * @param videoFilePath - Path to the input video file
 * @param visionProvider - Vision provider instance
 * @param ttsProvider - TTS provider instance
 * @param options - Optional configuration overrides; missing keys fall back to the default configuration
 * @param stats - Stats object for tracking; it is updated in place
 * @returns The input video path and the path of the generated audio description file
 */
export async function generateAudioDescription(
  videoFilePath: string,
  visionProvider: VisionProvider,
  ttsProvider: TTSProvider,
  options: Partial<Config> = {},
  stats: Stats
): Promise<ProcessingResult> {
  // Merge provided options with defaults so that a partial `options` object
  // still yields a complete configuration (directories, intervals, prompts...).
  const settings = { ...getDefaultConfig(), ...options } as Config;

  // Ensure temporary and output directories exist
  if (!fs.existsSync(settings.tempDir)) {
    fs.mkdirSync(settings.tempDir, { recursive: true });
  }
  if (!fs.existsSync(settings.outputDir)) {
    fs.mkdirSync(settings.outputDir, { recursive: true });
  }

  // Get video duration and derive the number of frames to capture
  const videoDuration = getVideoDuration(videoFilePath);
  const totalFrames = Math.floor(videoDuration / settings.captureIntervalSeconds);
  stats.totalFrames = totalFrames;
  console.log(`Video duration: ${videoDuration} seconds`);

  // If batchTimeMode is enabled, use the batch-based approach instead
  if (settings.batchTimeMode) {
    return await generateAudioDescriptionBatch(
      videoFilePath,
      videoDuration,
      settings,
      visionProvider,
      ttsProvider,
      stats
    );
  }

  console.log(`Will capture ${totalFrames} frames at ${settings.captureIntervalSeconds} second intervals`);

  // Context window to store previous frames
  const frameContext: { index: number; path: string; timePosition: number }[] = [];

  // Array to store audio segment information
  const audioSegments: AudioSegment[] = [];

  // Track our current time position (will be adjusted for audio overlap)
  let currentTimePosition = 0;

  // Drift beyond this many seconds from the original schedule triggers a warning
  const maxAllowableDrift = settings.captureIntervalSeconds * 2;

  // Small pause inserted between descriptions to prevent hard cuts
  const bufferTime = 0.25;

  // Process each frame
  for (let i = 0; i < totalFrames; i++) {
    // Ideal time position based on the original schedule
    const idealTimePosition = i * settings.captureIntervalSeconds;

    // Adjusted time position that accounts for previous audio durations
    const timePosition = currentTimePosition;

    // Drift of the adjusted position from the original schedule
    const timelineDrift = timePosition - idealTimePosition;

    // Log if drift is becoming significant
    if (Math.abs(timelineDrift) > maxAllowableDrift) {
      console.warn(`WARNING: Timeline drift at frame ${i} is ${timelineDrift.toFixed(2)} seconds.`);
    }

    const frameFilePath = path.join(settings.tempDir, `frame_${i.toString().padStart(5, '0')}.jpg`);

    // Capture the frame at the ideal time so the picture matches the schedule,
    // even when the audio for it is placed later.
    captureVideoFrame(videoFilePath, idealTimePosition, frameFilePath);
    console.log(`Captured frame at ${idealTimePosition} seconds (scheduled at ${timePosition.toFixed(2)} seconds)`);

    // Add current frame to context
    frameContext.push({
      index: i,
      path: frameFilePath,
      timePosition
    });

    // Keep context window at specified size
    if (frameContext.length > settings.contextWindowSize) {
      frameContext.shift();
    }

    // Generate description
    let description: string;
    let usageStats: { inputTokens: number; outputTokens: number; totalTokens: number };

    if (frameContext.length === 1) {
      // Only one frame in the context - just describe what's in it
      const result = await visionProvider.describeImage(frameFilePath, settings.defaultPrompt);
      description = result.description;
      usageStats = result.usage;
    } else {
      // Compare with previous frame
      const previousFrame = frameContext[frameContext.length - 2];
      const result = await visionProvider.compareImages(previousFrame.path, frameFilePath, settings.changePrompt);
      description = result.description;
      usageStats = result.usage;
    }

    // Update stats with the usage reported by the vision provider
    stats.totalVisionInputCost += usageStats.inputTokens;
    stats.totalVisionOutputCost += usageStats.outputTokens;
    stats.totalCost += usageStats.totalTokens;

    console.log(`Description: ${description}`);

    // Generate speech from description
    const audioFilePath = path.join(settings.tempDir, `audio_${i.toString().padStart(5, '0')}.mp3`);

    const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
      voice: settings.ttsVoice,
      model: settings.ttsProviders[settings.ttsProvider].model,
      speedFactor: settings.ttsSpeedFactor,
      instructions: settings.ttsInstructions
    });

    const audioDuration = ttsResult.duration;
    stats.totalTTSCost += ttsResult.cost;

    console.log(`Audio duration: ${audioDuration} seconds`);

    // Store segment information
    audioSegments.push({
      audioFile: audioFilePath,
      startTime: timePosition,
      duration: audioDuration,
      description
    });

    // The next description may not start before this one (plus buffer) ends
    currentTimePosition = timePosition + audioDuration + bufferTime;

    // If the audio ended before the next scheduled frame, snap back to the schedule
    const nextIdealPosition = (i + 1) * settings.captureIntervalSeconds;
    if (currentTimePosition < nextIdealPosition) {
      console.log(`Audio finished before next scheduled frame. Catching up with timeline.`);
      currentTimePosition = nextIdealPosition;
    }
  }

  // Combine audio segments into final audio description track
  const outputAudioPath = path.join(settings.outputDir, `${path.basename(videoFilePath, path.extname(videoFilePath))}_description.mp3`);
  combineAudioSegments(audioSegments, outputAudioPath, videoDuration, settings);

  // Temporary frame/audio files are intentionally left in tempDir.

  console.log(`\nAudio description generated: ${outputAudioPath}`);
  console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
  printStats(stats, settings);

  return {
    videoFile: videoFilePath,
    audioDescriptionFile: outputAudioPath
  };
}
|
||||
|
||||
/**
 * Generate audio description using the "batch time" mode with overlap prevention.
 *
 * The video is split into windows of `batchWindowDuration` seconds. For each
 * window, `framesInBatch` frames are captured at evenly spaced offsets from the
 * window's ideal start, described together by the vision provider (with the
 * previous batch's description and last two frames as context), and spoken by
 * the TTS provider. Each clip is scheduled no earlier than the end of the
 * previous clip plus a 0.5 second buffer. The clips are finally combined into
 * `<video name>_description_batch.mp3` inside `outputDir`.
 *
 * @param videoFilePath - Path to the input video file
 * @param videoDuration - Duration of the video in seconds
 * @param settings - The merged config and user options
 * @param visionProvider - The vision provider instance
 * @param ttsProvider - The TTS provider instance
 * @param stats - Stats object for tracking; it is updated in place
 * @returns The input video path and the path of the generated audio description file
 */
async function generateAudioDescriptionBatch(
  videoFilePath: string,
  videoDuration: number,
  settings: Config,
  visionProvider: VisionProvider,
  ttsProvider: TTSProvider,
  stats: Stats
): Promise<ProcessingResult> {
  const batchCount = Math.floor(videoDuration / settings.batchWindowDuration);
  console.log(`Using batchTimeMode. Total batches: ${batchCount} (each covers ${settings.batchWindowDuration} sec)`);

  // Description and trailing frames of the previous batch, passed on as context
  let previousBatch: BatchContext = {};

  const segments: AudioSegment[] = [];

  // Earliest moment at which the next clip may start
  let nextStart = 0;

  // Warn once a batch is scheduled more than half a window away from its ideal start
  const driftLimit = settings.batchWindowDuration * 0.5;

  // Pause inserted between consecutive descriptions
  const gapSeconds = 0.5;

  for (let batchIndex = 0; batchIndex < batchCount; batchIndex++) {
    // Where this batch sits on the original schedule
    const idealStart = batchIndex * settings.batchWindowDuration;

    // Where its audio will actually be placed
    const scheduledStart = nextStart;

    const drift = scheduledStart - idealStart;
    if (Math.abs(drift) > driftLimit) {
      console.warn(`WARNING: Timeline drift at batch ${batchIndex} is ${drift.toFixed(2)} seconds.`);
    }

    const idealEnd = idealStart + settings.batchWindowDuration;
    if (idealEnd > videoDuration) {
      break; // Safety check
    }

    console.log(`Processing batch #${batchIndex}: Original time window ${idealStart}-${idealEnd} sec, scheduled at ${scheduledStart.toFixed(2)} sec`);

    // Frames are taken at the ideal timing, not the (possibly delayed) audio timing
    const batchFrames: string[] = [];
    for (let frameIndex = 0; frameIndex < settings.framesInBatch; frameIndex++) {
      const captureTime = idealStart + (frameIndex * settings.batchWindowDuration) / settings.framesInBatch;
      const framePath = path.join(settings.tempDir, `batch_${batchIndex}_frame_${frameIndex}.jpg`);
      captureVideoFrame(videoFilePath, captureTime, framePath);
      batchFrames.push(framePath);
    }

    // Ask the vision provider for one description covering the whole batch
    const { description, usage } = await visionProvider.describeBatch(
      batchFrames,
      previousBatch,
      settings.batchPrompt
    );

    // Accumulate the usage reported by the vision provider
    stats.totalVisionInputCost += usage.inputTokens;
    stats.totalVisionOutputCost += usage.outputTokens;
    stats.totalCost += usage.totalTokens;

    console.log(`Batch #${batchIndex} description:\n${description}\n`);

    // Turn the description into speech
    const clipPath = path.join(settings.tempDir, `batch_audio_${batchIndex}.mp3`);
    const speech = await ttsProvider.textToSpeech(description, clipPath, {
      voice: settings.ttsVoice,
      model: settings.ttsProviders[settings.ttsProvider].model,
      speedFactor: settings.ttsSpeedFactor,
      instructions: settings.ttsInstructions
    });

    const clipDuration = speech.duration;
    stats.totalTTSCost += speech.cost;

    console.log(`Batch #${batchIndex} audio duration: ${clipDuration} seconds`);

    // Record the clip at its adjusted start time
    segments.push({
      audioFile: clipPath,
      startTime: scheduledStart,
      duration: clipDuration,
      description
    });

    // The following clip must wait for this one plus the gap...
    nextStart = scheduledStart + clipDuration + gapSeconds;

    // ...but never starts earlier than its own ideal slot.
    const followingIdealStart = (batchIndex + 1) * settings.batchWindowDuration;
    if (nextStart < followingIdealStart) {
      console.log(`Batch audio finished before next scheduled batch. Catching up with timeline.`);
      nextStart = followingIdealStart;
    }

    // Remember what was just described so the next batch has context
    previousBatch = {
      lastDescription: description,
      lastFramePaths: batchFrames.slice(-2) // keep the last 2 frames from this batch
    };
  }

  // Merge every clip into a single track
  const videoBaseName = path.basename(videoFilePath, path.extname(videoFilePath));
  const outputAudioPath = path.join(settings.outputDir, `${videoBaseName}_description_batch.mp3`);
  combineAudioSegments(segments, outputAudioPath, videoDuration, settings);

  console.log(`\nBatch audio description generated: ${outputAudioPath}`);
  console.log(`To play with video, use: ffplay -i ${videoFilePath} -i ${outputAudioPath} -map 0:v -map 1:a`);
  printStats(stats, settings);

  return {
    videoFile: videoFilePath,
    audioDescriptionFile: outputAudioPath
  };
}
|
||||
17
tsconfig.json
Normal file
17
tsconfig.json
Normal file
@@ -0,0 +1,17 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2020",
|
||||
"module": "CommonJS",
|
||||
"moduleResolution": "node",
|
||||
"esModuleInterop": true,
|
||||
"strict": true,
|
||||
"outDir": "./dist",
|
||||
"declaration": true,
|
||||
"sourceMap": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"skipLibCheck": true,
|
||||
"resolveJsonModule": true
|
||||
},
|
||||
"include": ["src/**/*"],
|
||||
"exclude": ["node_modules", "dist"]
|
||||
}
|
||||
Reference in New Issue
Block a user