Modernize codebase
This commit is contained in:
2
index.js
2
index.js
@@ -21,7 +21,7 @@ const defaultConfig = {
|
|||||||
|
|
||||||
// Vision AI settings
|
// Vision AI settings
|
||||||
visionProvider: "gemini",
|
visionProvider: "gemini",
|
||||||
visionModel: "gemini-2.0-flash",
|
visionModel: "gemini-3.0-flash",
|
||||||
visionProviders: {
|
visionProviders: {
|
||||||
openai: {
|
openai: {
|
||||||
apiKey: process.env.OPENAI_API_KEY,
|
apiKey: process.env.OPENAI_API_KEY,
|
||||||
|
|||||||
6552
package-lock.json
generated
6552
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
10
package.json
10
package.json
@@ -4,10 +4,16 @@
|
|||||||
"description": "Generate AI-powered audio descriptions for video content",
|
"description": "Generate AI-powered audio descriptions for video content",
|
||||||
"main": "dist/index.js",
|
"main": "dist/index.js",
|
||||||
"types": "dist/index.d.ts",
|
"types": "dist/index.d.ts",
|
||||||
|
"bin": {
|
||||||
|
"aidio-desc": "./dist/cli/index.js"
|
||||||
|
},
|
||||||
|
"files": [
|
||||||
|
"dist"
|
||||||
|
],
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"build": "tsc",
|
"build": "tsc",
|
||||||
"start": "node dist/index.js",
|
"start": "node dist/cli/index.js",
|
||||||
"dev": "ts-node src/index.ts",
|
"dev": "ts-node src/cli/index.ts",
|
||||||
"test": "jest",
|
"test": "jest",
|
||||||
"lint": "eslint src/**/*.ts",
|
"lint": "eslint src/**/*.ts",
|
||||||
"prepublishOnly": "npm run build"
|
"prepublishOnly": "npm run build"
|
||||||
|
|||||||
@@ -1,11 +1,36 @@
|
|||||||
import yargs from 'yargs/yargs';
|
import yargs from 'yargs/yargs';
|
||||||
import { hideBin } from 'yargs/helpers';
|
import { hideBin } from 'yargs/helpers';
|
||||||
|
|
||||||
|
export interface CLIArgs {
|
||||||
|
_: (string | number)[];
|
||||||
|
$0: string;
|
||||||
|
video_file_path?: string;
|
||||||
|
captureIntervalSeconds?: number;
|
||||||
|
contextWindowSize?: number;
|
||||||
|
visionProvider?: string;
|
||||||
|
visionModel?: string;
|
||||||
|
ttsProvider?: string;
|
||||||
|
ttsModel?: string;
|
||||||
|
ttsVoice?: string;
|
||||||
|
ttsSpeedFactor?: number;
|
||||||
|
outputDir?: string;
|
||||||
|
tempDir?: string;
|
||||||
|
batchTimeMode?: boolean;
|
||||||
|
batchWindowDuration?: number;
|
||||||
|
framesInBatch?: number;
|
||||||
|
defaultPrompt?: string;
|
||||||
|
changePrompt?: string;
|
||||||
|
batchPrompt?: string;
|
||||||
|
estimate?: boolean;
|
||||||
|
config?: string;
|
||||||
|
saveConfig?: string;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse command line arguments
|
* Parse command line arguments
|
||||||
*/
|
*/
|
||||||
export function parseCommandLineArgs() {
|
export function parseCommandLineArgs(): CLIArgs {
|
||||||
return yargs(hideBin(process.argv))
|
const parsed = yargs(hideBin(process.argv))
|
||||||
.usage('Usage: $0 <video_file_path> [options]')
|
.usage('Usage: $0 <video_file_path> [options]')
|
||||||
.positional('video_file_path', {
|
.positional('video_file_path', {
|
||||||
describe: 'Path to the input video file',
|
describe: 'Path to the input video file',
|
||||||
@@ -21,7 +46,6 @@ export function parseCommandLineArgs() {
|
|||||||
describe: 'Number of frames to keep in context',
|
describe: 'Number of frames to keep in context',
|
||||||
type: 'number'
|
type: 'number'
|
||||||
})
|
})
|
||||||
// Vision provider options
|
|
||||||
.option('visionProvider', {
|
.option('visionProvider', {
|
||||||
describe: 'Provider to use for vision AI',
|
describe: 'Provider to use for vision AI',
|
||||||
type: 'string'
|
type: 'string'
|
||||||
@@ -30,7 +54,6 @@ export function parseCommandLineArgs() {
|
|||||||
describe: 'Model to use for vision AI',
|
describe: 'Model to use for vision AI',
|
||||||
type: 'string'
|
type: 'string'
|
||||||
})
|
})
|
||||||
// TTS provider options
|
|
||||||
.option('ttsProvider', {
|
.option('ttsProvider', {
|
||||||
describe: 'Provider to use for text-to-speech',
|
describe: 'Provider to use for text-to-speech',
|
||||||
type: 'string'
|
type: 'string'
|
||||||
@@ -107,5 +130,7 @@ export function parseCommandLineArgs() {
|
|||||||
.example('$0 video.mp4 --estimate', 'Only estimate the processing cost')
|
.example('$0 video.mp4 --estimate', 'Only estimate the processing cost')
|
||||||
.example('$0 video.mp4 --config myconfig.json', 'Use settings from a config file')
|
.example('$0 video.mp4 --config myconfig.json', 'Use settings from a config file')
|
||||||
.example('$0 video.mp4 --saveConfig myconfig.json', 'Save current settings to a config file')
|
.example('$0 video.mp4 --saveConfig myconfig.json', 'Save current settings to a config file')
|
||||||
.argv;
|
.argv as unknown as CLIArgs;
|
||||||
|
|
||||||
|
return parsed;
|
||||||
}
|
}
|
||||||
90
src/cli/index.ts
Normal file
90
src/cli/index.ts
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
import 'dotenv/config';
|
||||||
|
import { getDefaultConfig } from '../config/config';
|
||||||
|
import { createStats } from '../config/stats';
|
||||||
|
import { VisionProviderFactory } from '../providers/vision/visionProviderFactory';
|
||||||
|
import { TTSProviderFactory } from '../providers/tts/ttsProviderFactory';
|
||||||
|
import { generateAudioDescription } from '../utils/processor';
|
||||||
|
import { estimateCost } from '../utils/costEstimator';
|
||||||
|
import { loadConfigFromFile, saveConfigToFile } from '../utils/configUtils';
|
||||||
|
import { parseCommandLineArgs } from './args';
|
||||||
|
import { Config } from '../config/config';
|
||||||
|
|
||||||
|
async function main(): Promise<void> {
|
||||||
|
const argv = parseCommandLineArgs();
|
||||||
|
|
||||||
|
let config: Config = getDefaultConfig();
|
||||||
|
|
||||||
|
if (argv.config) {
|
||||||
|
const fileConfig = loadConfigFromFile(argv.config);
|
||||||
|
config = { ...config, ...fileConfig };
|
||||||
|
}
|
||||||
|
|
||||||
|
const argvObj = argv as unknown as Record<string, unknown>;
|
||||||
|
Object.keys(argvObj).forEach(key => {
|
||||||
|
if (key !== '_' && key !== '$0' && key !== 'config' && key !== 'saveConfig' &&
|
||||||
|
key !== 'estimate' && key !== 'help' && key !== 'version' &&
|
||||||
|
argvObj[key] !== undefined) {
|
||||||
|
(config as any)[key] = argvObj[key];
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
if (argv.visionModel) {
|
||||||
|
if (!config.visionProviders[config.visionProvider]) {
|
||||||
|
config.visionProviders[config.visionProvider] = { model: '' };
|
||||||
|
}
|
||||||
|
config.visionProviders[config.visionProvider].model = argv.visionModel;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argv.ttsModel) {
|
||||||
|
if (!config.ttsProviders[config.ttsProvider]) {
|
||||||
|
config.ttsProviders[config.ttsProvider] = { model: '' };
|
||||||
|
}
|
||||||
|
config.ttsProviders[config.ttsProvider].model = argv.ttsModel;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argv.ttsVoice) {
|
||||||
|
if (!config.ttsProviders[config.ttsProvider]) {
|
||||||
|
config.ttsProviders[config.ttsProvider] = { model: '', voice: '' };
|
||||||
|
}
|
||||||
|
config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argv.saveConfig) {
|
||||||
|
saveConfigToFile(argv.saveConfig, config);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argv._.length < 1) {
|
||||||
|
console.error('Error: No video file specified');
|
||||||
|
console.log('Usage: node script.js <video_file_path> [options]');
|
||||||
|
console.log('Use --help for more information');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const videoFilePath = String(argv._[0]);
|
||||||
|
|
||||||
|
if (argv.estimate) {
|
||||||
|
try {
|
||||||
|
const costBreakdown = await estimateCost(videoFilePath, config);
|
||||||
|
console.log('\n=== COST ESTIMATION ===');
|
||||||
|
console.log(JSON.stringify(costBreakdown, null, 2));
|
||||||
|
console.log(`\nEstimated total cost: ${costBreakdown.apiCosts.total}`);
|
||||||
|
console.log(`Estimated processing time: ${costBreakdown.estimates.estimatedProcessingTimeMinutes.toFixed(1)} minutes`);
|
||||||
|
console.log('Note: Actual costs may vary based on image complexity and actual response lengths.');
|
||||||
|
} catch (err) {
|
||||||
|
console.error('Error estimating costs:', err);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
try {
|
||||||
|
const stats = createStats();
|
||||||
|
const visionProvider = VisionProviderFactory.getProvider(config);
|
||||||
|
const ttsProvider = TTSProviderFactory.getProvider(config);
|
||||||
|
await generateAudioDescription(videoFilePath, visionProvider, ttsProvider, config, stats);
|
||||||
|
} catch (err) {
|
||||||
|
console.error('Error generating audio description:', err);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (require.main === module) {
|
||||||
|
main().catch(err => console.error('Unhandled error:', err));
|
||||||
|
}
|
||||||
@@ -30,54 +30,61 @@ export interface Config {
|
|||||||
framesInBatch: number;
|
framesInBatch: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Default configuration options
|
/**
|
||||||
export const defaultConfig: Config = {
|
* Get default configuration options.
|
||||||
captureIntervalSeconds: 10,
|
* Uses a function so that process.env is read at call time
|
||||||
contextWindowSize: 5,
|
* (after dotenv has been loaded), not at module import time.
|
||||||
defaultPrompt: "Describe this frame from a video in 1-2 sentences for someone who cannot see it. Focus on key visual elements. Avoid using terms like 'in this frame', simply describe the actual frame. Keep sentences short and concise, as this will be used to generate an audio track which is overlayed on the video.",
|
*/
|
||||||
changePrompt: "Describe what has changed between these frames in 1-2 sentences for someone who cannot see the video. Focus on significant visual changes only. Avoid talking about meta information such as 'in this frame', or 'the significant change is', and merely describe the actual change taking place. Only describe the changes relevant to the last frame. The previous frames are attached for you to build context and build situational awareness. Keep it short and concise, as your text will be used to generate audio description tracks to be played with the video.",
|
export function getDefaultConfig(): Config {
|
||||||
batchPrompt: "Describe the sequence of frames in this batch over time for someone who cannot see it. Focus on what happens, changes, or stands out visually during these seconds. Keep it to 1-3 concise sentences, avoiding words like 'in these frames'—just describe what's happening. Use context from the previous batch if relevant. Keep sentences short and concise. Avoid speculation or overly verbose or unnecessary sentences. Try not to use nested sentences and keep sentences short to help flow. This will be used for audio description and mixed back in with the video file later, so we need to maintain consistency and quick pacing. Avoid using phrases such as 'as evidenced by' or 'suggesting'. Only focus on describing the visual scene. Do not repeat information given in the previous prompt, and focus only on what has changed since that description. Avoid talking about the scene or sequence, simply focus on the action within these frames. The listener knows that this is a video, so we do not need to remind them. Also avoid overusing phrases such as 'the scene shifts', the shifting or perspective change should be evident from the description of the sequence itself.",
|
return {
|
||||||
|
captureIntervalSeconds: 10,
|
||||||
// Vision AI settings
|
contextWindowSize: 5,
|
||||||
visionProvider: "gemini",
|
defaultPrompt: "Describe this frame from a video in 1-2 sentences for someone who cannot see it. Focus on key visual elements. Avoid using terms like 'in this frame', simply describe the actual frame. Keep sentences short and concise, as this will be used to generate an audio track which is overlayed on the video.",
|
||||||
visionModel: "gemini-2.0-flash",
|
changePrompt: "Describe what has changed between these frames in 1-2 sentences for someone who cannot see the video. Focus on significant visual changes only. Avoid talking about meta information such as 'in this frame', or 'the significant change is', and merely describe the actual change taking place. Only describe the changes relevant to the last frame. The previous frames are attached for you to build context and build situational awareness. Keep it short and concise, as your text will be used to generate audio description tracks to be played with the video.",
|
||||||
visionProviders: {
|
batchPrompt: "Describe the sequence of frames in this batch over time for someone who cannot see it. Focus on what happens, changes, or stands out visually during these seconds. Keep it to 1-3 concise sentences, avoiding words like 'in these frames'—just describe what's happening. Use context from the previous batch if relevant. Keep sentences short and concise. Avoid speculation or overly verbose or unnecessary sentences. Try not to use nested sentences and keep sentences short to help flow. This will be used for audio description and mixed back in with the video file later, so we need to maintain consistency and quick pacing. Avoid using phrases such as 'as evidenced by' or 'suggesting'. Only focus on describing the visual scene. Do not repeat information given in the previous prompt, and focus only on what has changed since that description. Avoid talking about the scene or sequence, simply focus on the action within these frames. The listener knows that this is a video, so we do not need to remind them. Also avoid overusing phrases such as 'the scene shifts', the shifting or perspective change should be evident from the description of the sequence itself.",
|
||||||
openai: {
|
|
||||||
apiKey: process.env.OPENAI_API_KEY,
|
// Vision AI settings
|
||||||
model: "gpt-4o",
|
visionProvider: "openai",
|
||||||
maxTokens: 300
|
visionModel: "gpt-5.4-mini",
|
||||||
|
visionProviders: {
|
||||||
|
openai: {
|
||||||
|
apiKey: process.env.OPENAI_API_KEY,
|
||||||
|
model: "gpt-5.4-mini",
|
||||||
|
maxTokens: 300
|
||||||
|
},
|
||||||
|
gemini: {
|
||||||
|
apiKey: process.env.GOOGLE_API_KEY,
|
||||||
|
model: "gemini-2.0-flash",
|
||||||
|
maxTokens: 300
|
||||||
|
},
|
||||||
|
ollama: {
|
||||||
|
baseUrl: "http://localhost:11434",
|
||||||
|
model: "gemma3:12b",
|
||||||
|
maxTokens: 3000
|
||||||
|
}
|
||||||
},
|
},
|
||||||
gemini: {
|
|
||||||
apiKey: process.env.GOOGLE_API_KEY,
|
// TTS settings
|
||||||
model: "gemini-2.0-flash",
|
ttsProvider: "openai",
|
||||||
maxTokens: 300
|
ttsVoice: "alloy",
|
||||||
|
ttsSpeedFactor: 1.5,
|
||||||
|
ttsProviders: {
|
||||||
|
openai: {
|
||||||
|
apiKey: process.env.OPENAI_API_KEY,
|
||||||
|
model: "tts-1-hd",
|
||||||
|
voice: "alloy"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
ollama: {
|
|
||||||
// Example config; adjust to match your local Ollama setup
|
// Video processing settings
|
||||||
baseUrl: "http://localhost:11434", // or wherever Ollama is hosted
|
outputDir: "./desc/output/",
|
||||||
model: "gemma3:12b",
|
tempDir: "./desc/tmp/",
|
||||||
maxTokens: 3000
|
batchTimeMode: true,
|
||||||
}
|
batchWindowDuration: 15,
|
||||||
// Add other vision providers here
|
framesInBatch: 10,
|
||||||
},
|
};
|
||||||
|
}
|
||||||
// TTS settings
|
|
||||||
ttsProvider: "openai",
|
// Keep a static export alias for backward compatibility
|
||||||
ttsVoice: "alloy", // Voice option for TTS
|
// (but callers should prefer getDefaultConfig() for correct env loading)
|
||||||
ttsSpeedFactor: 1.5, // Speed up audio by 50%
|
export const defaultConfig = getDefaultConfig();
|
||||||
ttsProviders: {
|
|
||||||
openai: {
|
|
||||||
apiKey: process.env.OPENAI_API_KEY,
|
|
||||||
model: "tts-1-hd",
|
|
||||||
voice: "alloy"
|
|
||||||
},
|
|
||||||
// Add other TTS providers here
|
|
||||||
},
|
|
||||||
|
|
||||||
// Video processing settings
|
|
||||||
outputDir: "./desc/output/",
|
|
||||||
tempDir: "./desc/tmp/",
|
|
||||||
batchTimeMode: true, // Whether to use the new batch time mode
|
|
||||||
batchWindowDuration: 15, // How many seconds each batch covers
|
|
||||||
framesInBatch: 10, // How many frames to capture within each batch
|
|
||||||
};
|
|
||||||
@@ -1,2 +1,2 @@
|
|||||||
export * from './config';
|
export { Config, getDefaultConfig, defaultConfig } from './config';
|
||||||
export * from './stats';
|
export { createStats, printStats } from './stats';
|
||||||
@@ -11,47 +11,30 @@ export const createStats = (): Stats => ({
|
|||||||
totalCost: 0
|
totalCost: 0
|
||||||
});
|
});
|
||||||
|
|
||||||
// Pricing interface
|
|
||||||
interface PricingData {
|
|
||||||
vision: {
|
|
||||||
[provider: string]: {
|
|
||||||
[model: string]: {
|
|
||||||
input: number;
|
|
||||||
output: number;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
tts: {
|
|
||||||
[provider: string]: {
|
|
||||||
[model: string]: number
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
// Pricing constants (as of March 2025)
|
// Pricing constants (as of March 2025)
|
||||||
const pricing: PricingData = {
|
const pricing: {
|
||||||
|
vision: Record<string, Record<string, { input: number; output: number }>>;
|
||||||
|
tts: Record<string, Record<string, number>>;
|
||||||
|
} = {
|
||||||
vision: {
|
vision: {
|
||||||
openai: {
|
openai: {
|
||||||
'gpt-4o': {
|
'gpt-4o': {
|
||||||
input: 0.0025, // per 1K input tokens
|
input: 0.0025,
|
||||||
output: 0.01 // per 1K output tokens
|
output: 0.01
|
||||||
}
|
}
|
||||||
// Add other OpenAI models here
|
|
||||||
},
|
},
|
||||||
gemini: {
|
gemini: {
|
||||||
'gemini-pro-vision': {
|
'gemini-pro-vision': {
|
||||||
input: 0.0025, // per 1K input tokens
|
input: 0.0025,
|
||||||
output: 0.0025 // per 1K output tokens
|
output: 0.0025
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Add other vision providers here
|
|
||||||
},
|
},
|
||||||
tts: {
|
tts: {
|
||||||
openai: {
|
openai: {
|
||||||
'tts-1': 0.015, // per 1K characters
|
'tts-1': 0.015,
|
||||||
'tts-1-hd': 0.030 // per 1K characters
|
'tts-1-hd': 0.030
|
||||||
}
|
}
|
||||||
// Add other TTS providers here
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
232
src/index.ts
232
src/index.ts
@@ -1,118 +1,114 @@
|
|||||||
import dotenv from 'dotenv';
|
import 'dotenv/config';
|
||||||
import { defaultConfig } from './config/config';
|
import { getDefaultConfig } from './config/config';
|
||||||
import { createStats } from './config/stats';
|
import { createStats } from './config/stats';
|
||||||
import { VisionProviderFactory } from './providers/vision/visionProviderFactory';
|
import { VisionProviderFactory } from './providers/vision/visionProviderFactory';
|
||||||
import { TTSProviderFactory } from './providers/tts/ttsProviderFactory';
|
import { TTSProviderFactory } from './providers/tts/ttsProviderFactory';
|
||||||
import { generateAudioDescription } from './utils/processor';
|
import { generateAudioDescription } from './utils/processor';
|
||||||
import { estimateCost } from './utils/costEstimator';
|
import { estimateCost } from './utils/costEstimator';
|
||||||
import { loadConfigFromFile, saveConfigToFile } from './utils/configUtils';
|
import { loadConfigFromFile, saveConfigToFile } from './utils/configUtils';
|
||||||
import { parseCommandLineArgs } from './cli/args';
|
import { parseCommandLineArgs } from './cli/args';
|
||||||
import { ProcessingResult, CostBreakdown } from './interfaces';
|
|
||||||
|
// Export functions and types for use as a module
|
||||||
// Load environment variables
|
export { generateAudioDescriptionFromOptions, generateAudioDescription } from './utils/processor';
|
||||||
dotenv.config();
|
export { estimateCost } from './utils/costEstimator';
|
||||||
|
export { getDefaultConfig, defaultConfig } from './config/config';
|
||||||
// Main execution when run directly
|
export { VisionProviderFactory } from './providers/vision/visionProviderFactory';
|
||||||
async function main(): Promise<void> {
|
export { TTSProviderFactory } from './providers/tts/ttsProviderFactory';
|
||||||
// Parse command line arguments
|
export { createStats, printStats } from './config/stats';
|
||||||
const argv = parseCommandLineArgs();
|
export { loadConfigFromFile, saveConfigToFile } from './utils/configUtils';
|
||||||
|
export type { Config } from './config/config';
|
||||||
// Start with default config
|
export type {
|
||||||
let config = { ...defaultConfig };
|
ProcessingResult,
|
||||||
|
CostBreakdown,
|
||||||
// If a config file is specified, load it
|
Stats,
|
||||||
if (argv.config) {
|
VisionProvider,
|
||||||
const fileConfig = loadConfigFromFile(argv.config);
|
TTSProvider,
|
||||||
config = { ...config, ...fileConfig };
|
AudioSegment,
|
||||||
}
|
BatchContext,
|
||||||
|
VisionResult,
|
||||||
// Override with any command line arguments
|
TTSResult,
|
||||||
Object.keys(argv).forEach(key => {
|
VisionProviderConfig,
|
||||||
if (key !== '_' && key !== '$0' && key !== 'config' && key !== 'saveConfig' &&
|
TTSProviderConfig,
|
||||||
key !== 'estimate' && key !== 'help' && key !== 'version' &&
|
TTSOptions
|
||||||
argv[key as keyof typeof argv] !== undefined) {
|
} from './interfaces';
|
||||||
(config as any)[key] = argv[key as keyof typeof argv];
|
|
||||||
}
|
// CLI entry point when run directly
|
||||||
});
|
if (require.main === module) {
|
||||||
|
main().catch(err => console.error('Unhandled error:', err));
|
||||||
// Handle nested provider configurations
|
}
|
||||||
if (argv.visionModel) {
|
|
||||||
if (!config.visionProviders[config.visionProvider]) {
|
async function main(): Promise<void> {
|
||||||
config.visionProviders[config.visionProvider] = { model: '' };
|
const argv = parseCommandLineArgs();
|
||||||
}
|
|
||||||
config.visionProviders[config.visionProvider].model = argv.visionModel as string;
|
let config = getDefaultConfig();
|
||||||
}
|
|
||||||
|
if (argv.config) {
|
||||||
if (argv.ttsModel) {
|
const fileConfig = loadConfigFromFile(argv.config);
|
||||||
if (!config.ttsProviders[config.ttsProvider]) {
|
config = { ...config, ...fileConfig };
|
||||||
config.ttsProviders[config.ttsProvider] = { model: '' };
|
}
|
||||||
}
|
|
||||||
config.ttsProviders[config.ttsProvider].model = argv.ttsModel as string;
|
const argvObj = argv as unknown as Record<string, unknown>;
|
||||||
}
|
Object.keys(argvObj).forEach(key => {
|
||||||
|
if (key !== '_' && key !== '$0' && key !== 'config' && key !== 'saveConfig' &&
|
||||||
if (argv.ttsVoice) {
|
key !== 'estimate' && key !== 'help' && key !== 'version' &&
|
||||||
if (!config.ttsProviders[config.ttsProvider]) {
|
argvObj[key] !== undefined) {
|
||||||
config.ttsProviders[config.ttsProvider] = { model: '', voice: '' };
|
(config as any)[key] = argvObj[key];
|
||||||
}
|
}
|
||||||
config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice as string;
|
});
|
||||||
}
|
|
||||||
|
if (argv.visionModel) {
|
||||||
// Save configuration if requested
|
if (!config.visionProviders[config.visionProvider]) {
|
||||||
if (argv.saveConfig) {
|
config.visionProviders[config.visionProvider] = { model: '' };
|
||||||
saveConfigToFile(argv.saveConfig as string, config);
|
}
|
||||||
}
|
config.visionProviders[config.visionProvider].model = argv.visionModel;
|
||||||
|
}
|
||||||
// Check if a video file is provided
|
|
||||||
if (argv._.length < 1) {
|
if (argv.ttsModel) {
|
||||||
console.error('Error: No video file specified');
|
if (!config.ttsProviders[config.ttsProvider]) {
|
||||||
console.log('Usage: node script.js <video_file_path> [options]');
|
config.ttsProviders[config.ttsProvider] = { model: '' };
|
||||||
console.log('Use --help for more information');
|
}
|
||||||
process.exit(1);
|
config.ttsProviders[config.ttsProvider].model = argv.ttsModel;
|
||||||
}
|
}
|
||||||
|
|
||||||
const videoFilePath = argv._[0] as string;
|
if (argv.ttsVoice) {
|
||||||
|
if (!config.ttsProviders[config.ttsProvider]) {
|
||||||
// Run estimation or full processing
|
config.ttsProviders[config.ttsProvider] = { model: '', voice: '' };
|
||||||
if (argv.estimate) {
|
}
|
||||||
try {
|
config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
|
||||||
const costBreakdown = await estimateCost(videoFilePath, config);
|
}
|
||||||
console.log('\n=== COST ESTIMATION ===');
|
|
||||||
console.log(JSON.stringify(costBreakdown, null, 2));
|
if (argv.saveConfig) {
|
||||||
console.log(`\nEstimated total cost: ${costBreakdown.apiCosts.total}`);
|
saveConfigToFile(argv.saveConfig, config);
|
||||||
console.log(`Estimated processing time: ${costBreakdown.estimates.estimatedProcessingTimeMinutes.toFixed(1)} minutes`);
|
}
|
||||||
console.log('Note: Actual costs may vary based on image complexity and actual response lengths.');
|
|
||||||
} catch (err) {
|
if (argv._.length < 1) {
|
||||||
console.error('Error estimating costs:', err);
|
console.error('Error: No video file specified');
|
||||||
}
|
console.log('Usage: node dist/index.js <video_file_path> [options]');
|
||||||
} else {
|
console.log('Use --help for more information');
|
||||||
// Run the full generator
|
process.exit(1);
|
||||||
try {
|
}
|
||||||
// Initialize the stats object
|
|
||||||
const stats = createStats();
|
const videoFilePath = String(argv._[0]);
|
||||||
|
|
||||||
// Initialize providers
|
if (argv.estimate) {
|
||||||
const visionProvider = VisionProviderFactory.getProvider(config);
|
try {
|
||||||
const ttsProvider = TTSProviderFactory.getProvider(config);
|
const costBreakdown = await estimateCost(videoFilePath, config);
|
||||||
|
console.log('\n=== COST ESTIMATION ===');
|
||||||
await generateAudioDescription(videoFilePath, visionProvider, ttsProvider, config, stats);
|
console.log(JSON.stringify(costBreakdown, null, 2));
|
||||||
} catch (err) {
|
console.log(`\nEstimated total cost: ${costBreakdown.apiCosts.total}`);
|
||||||
console.error('Error generating audio description:', err);
|
console.log(`Estimated processing time: ${costBreakdown.estimates.estimatedProcessingTimeMinutes.toFixed(1)} minutes`);
|
||||||
}
|
console.log('Note: Actual costs may vary based on image complexity and actual response lengths.');
|
||||||
}
|
} catch (err) {
|
||||||
}
|
console.error('Error estimating costs:', err);
|
||||||
|
}
|
||||||
// Only run the main function if this file is executed directly (not imported)
|
} else {
|
||||||
if (require.main === module) {
|
try {
|
||||||
main().catch(err => console.error('Unhandled error:', err));
|
const stats = createStats();
|
||||||
}
|
const visionProvider = VisionProviderFactory.getProvider(config);
|
||||||
|
const ttsProvider = TTSProviderFactory.getProvider(config);
|
||||||
// Export functions and types for use as a module
|
await generateAudioDescription(videoFilePath, visionProvider, ttsProvider, config, stats);
|
||||||
export {
|
} catch (err) {
|
||||||
generateAudioDescription,
|
console.error('Error generating audio description:', err);
|
||||||
estimateCost,
|
}
|
||||||
defaultConfig,
|
}
|
||||||
VisionProviderFactory,
|
}
|
||||||
TTSProviderFactory,
|
|
||||||
ProcessingResult,
|
|
||||||
CostBreakdown
|
|
||||||
};
|
|
||||||
|
|||||||
@@ -1,25 +1,18 @@
|
|||||||
import fs from 'fs';
|
import fs from 'fs';
|
||||||
|
import { GoogleGenerativeAI } from '@google/generative-ai';
|
||||||
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
|
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
|
||||||
|
|
||||||
type GoogleGenerativeAI = any;
|
|
||||||
type GenerativeModel = any;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Google Gemini Vision Provider Implementation
|
* Google Gemini Vision Provider Implementation
|
||||||
*/
|
*/
|
||||||
export class GeminiVisionProvider implements VisionProvider {
|
export class GeminiVisionProvider implements VisionProvider {
|
||||||
private config: VisionProviderConfig;
|
private config: VisionProviderConfig;
|
||||||
private genAI: GoogleGenerativeAI;
|
private genAI: GoogleGenerativeAI;
|
||||||
private model: GenerativeModel;
|
private model: any;
|
||||||
|
|
||||||
constructor(config: VisionProviderConfig) {
|
constructor(config: VisionProviderConfig) {
|
||||||
this.config = config;
|
this.config = config;
|
||||||
|
this.genAI = new GoogleGenerativeAI(config.apiKey!);
|
||||||
// Import the Google Generative AI SDK
|
|
||||||
const { GoogleGenerativeAI } = require("@google/generative-ai");
|
|
||||||
|
|
||||||
// Initialize the API
|
|
||||||
this.genAI = new GoogleGenerativeAI(config.apiKey);
|
|
||||||
this.model = this.genAI.getGenerativeModel({ model: config.model });
|
this.model = this.genAI.getGenerativeModel({ model: config.model });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ export class OpenAIVisionProvider implements VisionProvider {
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
max_tokens: this.config.maxTokens || 300
|
max_completion_tokens: this.config.maxTokens || 300
|
||||||
});
|
});
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -101,7 +101,7 @@ export class OpenAIVisionProvider implements VisionProvider {
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
max_tokens: this.config.maxTokens || 300
|
max_completion_tokens: this.config.maxTokens || 300
|
||||||
});
|
});
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -171,7 +171,7 @@ export class OpenAIVisionProvider implements VisionProvider {
|
|||||||
const response = await this.openai.chat.completions.create({
|
const response = await this.openai.chat.completions.create({
|
||||||
model: this.config.model,
|
model: this.config.model,
|
||||||
messages,
|
messages,
|
||||||
max_tokens: this.config.maxTokens || 300
|
max_completion_tokens: this.config.maxTokens || 300
|
||||||
});
|
});
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
@@ -37,31 +37,29 @@ export async function estimateCost(
|
|||||||
console.log(`Will process ${totalUnits} ${unitType}`);
|
console.log(`Will process ${totalUnits} ${unitType}`);
|
||||||
|
|
||||||
// Pricing constants (as of March 2025, update as needed)
|
// Pricing constants (as of March 2025, update as needed)
|
||||||
const pricing = {
|
const pricing: {
|
||||||
// Get pricing based on vision provider
|
vision: Record<string, Record<string, { input: number; output: number }>>;
|
||||||
|
tts: Record<string, Record<string, number>>;
|
||||||
|
} = {
|
||||||
vision: {
|
vision: {
|
||||||
openai: {
|
openai: {
|
||||||
'gpt-4o': {
|
'gpt-4o': {
|
||||||
input: 0.0025, // per 1K input tokens
|
input: 0.0025,
|
||||||
output: 0.01 // per 1K output tokens
|
output: 0.01
|
||||||
}
|
}
|
||||||
// Add other OpenAI models here
|
|
||||||
},
|
},
|
||||||
gemini: {
|
gemini: {
|
||||||
'gemini-pro-vision': {
|
'gemini-pro-vision': {
|
||||||
input: 0.0025, // per 1K input tokens
|
input: 0.0025,
|
||||||
output: 0.0025 // per 1K output tokens
|
output: 0.0025
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Add other vision providers here
|
|
||||||
},
|
},
|
||||||
// Get pricing based on TTS provider
|
|
||||||
tts: {
|
tts: {
|
||||||
openai: {
|
openai: {
|
||||||
'tts-1': 0.015, // per 1K characters
|
'tts-1': 0.015,
|
||||||
'tts-1-hd': 0.030 // per 1K characters
|
'tts-1-hd': 0.030
|
||||||
}
|
}
|
||||||
// Add other TTS providers here
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -8,8 +8,10 @@ import {
|
|||||||
BatchContext,
|
BatchContext,
|
||||||
ProcessingResult
|
ProcessingResult
|
||||||
} from '../interfaces';
|
} from '../interfaces';
|
||||||
import { Config } from '../config/config';
|
import { Config, getDefaultConfig } from '../config/config';
|
||||||
import { printStats } from '../config/stats';
|
import { printStats, createStats } from '../config/stats';
|
||||||
|
import { VisionProviderFactory } from '../providers/vision/visionProviderFactory';
|
||||||
|
import { TTSProviderFactory } from '../providers/tts/ttsProviderFactory';
|
||||||
import {
|
import {
|
||||||
getVideoDuration,
|
getVideoDuration,
|
||||||
captureVideoFrame,
|
captureVideoFrame,
|
||||||
@@ -17,7 +19,35 @@ import {
|
|||||||
} from './mediaUtils';
|
} from './mediaUtils';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate audio description for a video
|
* High-level API: Generate audio description for a video with just options.
|
||||||
|
* This internally creates providers and stats so callers don't need to.
|
||||||
|
*
|
||||||
|
* @param videoFilePath - Path to the input video file
|
||||||
|
* @param options - Optional configuration overrides
|
||||||
|
* @returns Result of the operation
|
||||||
|
*/
|
||||||
|
export async function generateAudioDescriptionFromOptions(
|
||||||
|
videoFilePath: string,
|
||||||
|
options: Partial<Config> = {}
|
||||||
|
): Promise<ProcessingResult> {
|
||||||
|
const config = { ...getDefaultConfig(), ...options };
|
||||||
|
|
||||||
|
if (!fs.existsSync(config.tempDir)) {
|
||||||
|
fs.mkdirSync(config.tempDir, { recursive: true });
|
||||||
|
}
|
||||||
|
if (!fs.existsSync(config.outputDir)) {
|
||||||
|
fs.mkdirSync(config.outputDir, { recursive: true });
|
||||||
|
}
|
||||||
|
|
||||||
|
const visionProvider = VisionProviderFactory.getProvider(config);
|
||||||
|
const ttsProvider = TTSProviderFactory.getProvider(config);
|
||||||
|
const stats = createStats();
|
||||||
|
|
||||||
|
return generateAudioDescription(videoFilePath, visionProvider, ttsProvider, config, stats);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generate audio description for a video (low-level API requiring pre-initialized providers).
|
||||||
* @param videoFilePath - Path to the input video file
|
* @param videoFilePath - Path to the input video file
|
||||||
* @param visionProvider - Vision provider instance
|
* @param visionProvider - Vision provider instance
|
||||||
* @param ttsProvider - TTS provider instance
|
* @param ttsProvider - TTS provider instance
|
||||||
|
|||||||
Reference in New Issue
Block a user