Compare commits
5 Commits
fc02d2001c
...
3a198d7d50
| Author | SHA1 | Date | |
|---|---|---|---|
| 3a198d7d50 | |||
| f05e57493c | |||
| 6e9a26557f | |||
| eb15af3a36 | |||
| 19975917c5 |
895
package-lock.json
generated
895
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -19,6 +19,7 @@
|
||||
"prepublishOnly": "npm run build"
|
||||
},
|
||||
"dependencies": {
|
||||
"@google-cloud/text-to-speech": "^6.4.1",
|
||||
"@google/generative-ai": "^0.24.0",
|
||||
"axios": "^1.6.2",
|
||||
"dotenv": "^16.3.1",
|
||||
@@ -51,4 +52,4 @@
|
||||
],
|
||||
"author": "",
|
||||
"license": "MIT"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ export interface CLIArgs {
|
||||
ttsModel?: string;
|
||||
ttsVoice?: string;
|
||||
ttsSpeedFactor?: number;
|
||||
ttsInstructions?: string;
|
||||
outputDir?: string;
|
||||
tempDir?: string;
|
||||
batchTimeMode?: boolean;
|
||||
@@ -73,6 +74,10 @@ export function parseCommandLineArgs(): CLIArgs {
|
||||
describe: 'Speed factor for the audio playback',
|
||||
type: 'number'
|
||||
})
|
||||
.option('ttsInstructions', {
|
||||
describe: 'Instructions for TTS voice style (gpt-4o-mini-tts)',
|
||||
type: 'string'
|
||||
})
|
||||
.option('outputDir', {
|
||||
alias: 'o',
|
||||
describe: 'Directory for output files',
|
||||
|
||||
@@ -18,6 +18,7 @@ export interface Config {
|
||||
ttsProvider: string;
|
||||
ttsVoice: string;
|
||||
ttsSpeedFactor: number;
|
||||
ttsInstructions?: string;
|
||||
ttsProviders: {
|
||||
[key: string]: TTSProviderConfig;
|
||||
};
|
||||
@@ -61,6 +62,12 @@ export function getDefaultConfig(): Config {
|
||||
baseUrl: "http://localhost:11434",
|
||||
model: "gemma3:12b",
|
||||
maxTokens: 3000
|
||||
},
|
||||
openrouter: {
|
||||
apiKey: process.env.OPENROUTER_API_KEY,
|
||||
model: "anthropic/claude-sonnet-4.5",
|
||||
baseUrl: "https://openrouter.ai/api/v1",
|
||||
maxTokens: 300
|
||||
}
|
||||
},
|
||||
|
||||
@@ -68,11 +75,23 @@ export function getDefaultConfig(): Config {
|
||||
ttsProvider: "openai",
|
||||
ttsVoice: "alloy",
|
||||
ttsSpeedFactor: 1.5,
|
||||
ttsInstructions: "Speak in a calm, narrating tone suitable for audio descriptions. Keep a steady pace and clear enunciation.",
|
||||
ttsProviders: {
|
||||
openai: {
|
||||
apiKey: process.env.OPENAI_API_KEY,
|
||||
model: "tts-1-hd",
|
||||
model: "gpt-4o-mini-tts",
|
||||
voice: "alloy"
|
||||
},
|
||||
elevenlabs: {
|
||||
apiKey: process.env.ELEVENLABS_API_KEY,
|
||||
model: "eleven_multilingual_v2",
|
||||
voice: "JBFqnCBsd6RMkjVDRZzb"
|
||||
},
|
||||
google: {
|
||||
apiKey: process.env.GOOGLE_CLOUD_TTS_KEY,
|
||||
keyFilename: process.env.GOOGLE_CLOUD_TTS_KEYFILE,
|
||||
model: "chirp-hd",
|
||||
voice: "en-US-Chirp-HD-F"
|
||||
}
|
||||
},
|
||||
|
||||
|
||||
@@ -77,6 +77,10 @@ async function main(): Promise<void> {
|
||||
config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
|
||||
}
|
||||
|
||||
if (argv.ttsInstructions) {
|
||||
config.ttsInstructions = argv.ttsInstructions;
|
||||
}
|
||||
|
||||
if (argv.saveConfig) {
|
||||
saveConfigToFile(argv.saveConfig, config);
|
||||
}
|
||||
|
||||
@@ -39,12 +39,14 @@ export interface TTSOptions {
|
||||
voice?: string;
|
||||
model?: string;
|
||||
speedFactor?: number;
|
||||
instructions?: string;
|
||||
}
|
||||
|
||||
export interface TTSProviderConfig {
|
||||
apiKey?: string;
|
||||
model: string;
|
||||
voice?: string;
|
||||
keyFilename?: string;
|
||||
}
|
||||
|
||||
export interface TTSProvider {
|
||||
|
||||
93
src/providers/tts/elevenLabsTTSProvider.ts
Normal file
93
src/providers/tts/elevenLabsTTSProvider.ts
Normal file
@@ -0,0 +1,93 @@
|
||||
import fs from 'fs';
|
||||
import { execSync } from 'child_process';
|
||||
import axios, { AxiosInstance } from 'axios';
|
||||
import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
|
||||
import { getAudioDuration } from '../../utils/mediaUtils';
|
||||
|
||||
/**
 * Text-to-speech provider backed by the ElevenLabs REST API.
 *
 * Each call posts the text to `/text-to-speech/{voice}`, writes the returned
 * MP3 to disk and, when a speed factor other than 1.0 is requested, re-times
 * the audio locally with ffmpeg. On any failure a one-second silent MP3 is
 * written to `outputPath` instead so downstream processing can continue.
 */
export class ElevenLabsTTSProvider implements TTSProvider {
  private config: TTSProviderConfig;
  private axiosInstance: AxiosInstance;
  /**
   * Value of the `request-id` response header from the most recent successful
   * request; sent back as `previous_request_ids` on the next request.
   */
  private lastRequestId: string | null = null;

  /**
   * @param config - Provider configuration; `apiKey` is sent as the `xi-api-key` header.
   */
  constructor(config: TTSProviderConfig) {
    this.config = config;
    this.axiosInstance = axios.create({
      baseURL: 'https://api.elevenlabs.io/v1',
      headers: {
        'xi-api-key': config.apiKey,
        'Content-Type': 'application/json'
      }
    });
  }

  /**
   * Synthesize `text` into an MP3 file at `outputPath`.
   *
   * @param text - Text to speak.
   * @param outputPath - Destination path of the final audio file.
   * @param options - Optional voice, model and speed overrides.
   * @returns The measured audio duration and the cost, reported as the number
   *          of characters in `text`. On failure returns `{ duration: 1, cost: 0 }`
   *          after writing one second of silence to `outputPath`.
   */
  async textToSpeech(
    text: string,
    outputPath: string,
    options: TTSOptions = {}
  ): Promise<TTSResult> {
    // Keep the raw download separate from the final file. When the output path
    // has no extension the regex would leave it unchanged, so fall back to a
    // plain suffix to guarantee the two paths differ.
    const tempOutputPath = /\.\w+$/.test(outputPath)
      ? outputPath.replace(/\.\w+$/, '_temp$&')
      : `${outputPath}_temp`;

    try {
      const voice = options.voice || this.config.voice || 'JBFqnCBsd6RMkjVDRZzb';
      const model = options.model || this.config.model || 'eleven_multilingual_v2';
      const speedFactor = options.speedFactor || 1.0;

      // The speed factor is deliberately NOT sent as `voice_settings.speed`:
      // it is applied below with ffmpeg, and sending it here as well would
      // apply the speed-up twice.
      // NOTE(review): ElevenLabs appears to accept only a narrow range for
      // `voice_settings.speed`; confirm against the API reference before
      // moving the speed change server-side.
      const requestBody: Record<string, unknown> = {
        text,
        model_id: model,
        voice_settings: {
          stability: 0.5,
          similarity_boost: 0.75,
          use_speaker_boost: true
        }
      };

      if (this.lastRequestId) {
        requestBody.previous_request_ids = [this.lastRequestId];
      }

      const response = await this.axiosInstance.post(
        `/text-to-speech/${voice}`,
        requestBody,
        {
          params: { output_format: 'mp3_44100_128' },
          responseType: 'arraybuffer'
        }
      );

      this.lastRequestId = response.headers['request-id'] || null;

      fs.writeFileSync(tempOutputPath, Buffer.from(response.data));

      if (speedFactor !== 1.0) {
        // NOTE(review): paths are interpolated into a shell command; they are
        // assumed to be generated internally and free of quote characters.
        execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
      } else {
        fs.renameSync(tempOutputPath, outputPath);
      }

      return {
        duration: getAudioDuration(outputPath),
        cost: text.length
      };
    } catch (error: any) {
      if (error.response) {
        // With responseType 'arraybuffer' the error body is binary; decode it
        // for logging and tolerate a missing body.
        const body = error.response.data != null
          ? Buffer.from(error.response.data).toString()
          : '';
        console.error(`ElevenLabs TTS error (${error.response.status}):`, body);
      } else {
        console.error('ElevenLabs TTS error:', error.message);
      }
      // Best-effort fallback: one second of silence so the pipeline can continue.
      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);
      return {
        duration: 1,
        cost: 0
      };
    } finally {
      // Remove the intermediate download on every path (no-op once renamed).
      fs.rmSync(tempOutputPath, { force: true });
    }
  }
}
|
||||
94
src/providers/tts/googleCloudTTSProvider.ts
Normal file
94
src/providers/tts/googleCloudTTSProvider.ts
Normal file
@@ -0,0 +1,94 @@
|
||||
import fs from 'fs';
|
||||
import { execSync } from 'child_process';
|
||||
import { TextToSpeechClient } from '@google-cloud/text-to-speech';
|
||||
import { google } from '@google-cloud/text-to-speech/build/protos/protos';
|
||||
import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
|
||||
import { getAudioDuration } from '../../utils/mediaUtils';
|
||||
|
||||
/**
 * Text-to-speech provider backed by Google Cloud Text-to-Speech.
 *
 * Each call synthesizes MP3 audio for the given text, writes it to disk and,
 * when a speed factor other than 1.0 is requested, re-times the audio locally
 * with ffmpeg. On any failure a one-second silent MP3 is written to
 * `outputPath` instead so downstream processing can continue.
 */
export class GoogleCloudTTSProvider implements TTSProvider {
  private config: TTSProviderConfig;
  private client: TextToSpeechClient;

  /**
   * @param config - Provider configuration. `apiKey` is always passed to the
   *                 client; `keyFilename` is passed as well when set.
   */
  constructor(config: TTSProviderConfig) {
    this.config = config;

    // Typed loosely because the option set is assembled conditionally.
    const clientConfig: any = {
      apiKey: config.apiKey,
      fallback: true
    };

    if (config.keyFilename) {
      clientConfig.keyFilename = config.keyFilename;
    }

    this.client = new TextToSpeechClient(clientConfig);
  }

  /**
   * Synthesize `text` into an MP3 file at `outputPath`.
   *
   * The request identifies the voice by name plus the language code derived
   * from that name; `options.model` is not part of the request.
   *
   * @param text - Text to speak.
   * @param outputPath - Destination path of the final audio file.
   * @param options - Optional voice and speed overrides.
   * @returns The measured audio duration and the cost, reported as the number
   *          of characters in `text`. On failure returns `{ duration: 1, cost: 0 }`
   *          after writing one second of silence to `outputPath`.
   */
  async textToSpeech(
    text: string,
    outputPath: string,
    options: TTSOptions = {}
  ): Promise<TTSResult> {
    // Keep the raw download separate from the final file. When the output path
    // has no extension the regex would leave it unchanged, so fall back to a
    // plain suffix to guarantee the two paths differ.
    const tempOutputPath = /\.\w+$/.test(outputPath)
      ? outputPath.replace(/\.\w+$/, '_temp$&')
      : `${outputPath}_temp`;

    try {
      const voice = options.voice || this.config.voice || 'en-US-Chirp-HD-F';
      const speedFactor = options.speedFactor || 1.0;

      // The speed factor is deliberately NOT sent as `audioConfig.speakingRate`:
      // it is applied below with ffmpeg, and sending it here as well would
      // apply the speed-up twice.
      const request: google.cloud.texttospeech.v1.ISynthesizeSpeechRequest = {
        input: { text },
        voice: {
          languageCode: this.extractLanguageCode(voice),
          name: voice
        },
        audioConfig: {
          audioEncoding: 'MP3'
        }
      };

      const [response] = await this.client.synthesizeSpeech(request);

      if (!response.audioContent) {
        throw new Error('No audio content returned from Google Cloud TTS');
      }

      // NOTE(review): a string here is presumed to be the base64 form of the
      // bytes field — confirm against the client library's behaviour.
      const audioBuffer = typeof response.audioContent === 'string'
        ? Buffer.from(response.audioContent, 'base64')
        : Buffer.from(response.audioContent);

      fs.writeFileSync(tempOutputPath, audioBuffer);

      if (speedFactor !== 1.0) {
        // NOTE(review): paths are interpolated into a shell command; they are
        // assumed to be generated internally and free of quote characters.
        execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
      } else {
        fs.renameSync(tempOutputPath, outputPath);
      }

      return {
        duration: getAudioDuration(outputPath),
        cost: text.length
      };
    } catch (error: any) {
      console.error('Google Cloud TTS error:', error.message);
      // Best-effort fallback: one second of silence so the pipeline can continue.
      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);
      return {
        duration: 1,
        cost: 0
      };
    } finally {
      // Remove the intermediate download on every path (no-op once renamed).
      fs.rmSync(tempOutputPath, { force: true });
    }
  }

  /**
   * Derive a language code from a voice name by joining its first two
   * dash-separated parts (e.g. `en-US-Chirp-HD-F` -> `en-US`).
   * Falls back to `en-US` when the name has fewer than two parts.
   */
  private extractLanguageCode(voiceName: string): string {
    const parts = voiceName.split('-');
    if (parts.length >= 2) {
      return `${parts[0]}-${parts[1]}`;
    }
    return 'en-US';
  }
}
|
||||
@@ -1,2 +1,4 @@
|
||||
export * from './ttsProviderFactory';
|
||||
export * from './openAITTSProvider';
|
||||
export * from './openAITTSProvider';
|
||||
export * from './elevenLabsTTSProvider';
|
||||
export * from './googleCloudTTSProvider';
|
||||
@@ -41,8 +41,9 @@ export class OpenAITTSProvider implements TTSProvider {
|
||||
|
||||
const mp3 = await this.openai.audio.speech.create({
|
||||
model: model,
|
||||
voice: voice as any, // Type casting to any to avoid type issues
|
||||
input: text
|
||||
voice: voice as any,
|
||||
input: text,
|
||||
...(options.instructions ? { instructions: options.instructions } : {})
|
||||
});
|
||||
|
||||
// Cost calculation is based on character count
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import { TTSProvider } from '../../interfaces';
|
||||
import { Config } from '../../config/config';
|
||||
import { OpenAITTSProvider } from './openAITTSProvider';
|
||||
import { ElevenLabsTTSProvider } from './elevenLabsTTSProvider';
|
||||
import { GoogleCloudTTSProvider } from './googleCloudTTSProvider';
|
||||
|
||||
/**
|
||||
* Factory for creating TTS providers
|
||||
@@ -17,6 +19,10 @@ export class TTSProviderFactory {
|
||||
switch (providerName) {
|
||||
case 'openai':
|
||||
return new OpenAITTSProvider(providerConfig);
|
||||
case 'elevenlabs':
|
||||
return new ElevenLabsTTSProvider(providerConfig);
|
||||
case 'google':
|
||||
return new GoogleCloudTTSProvider(providerConfig);
|
||||
// Add other providers here
|
||||
default:
|
||||
throw new Error(`TTS provider "${providerName}" not implemented.`);
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
export * from './visionProviderFactory';
|
||||
export * from './openAIVisionProvider';
|
||||
export * from './geminiVisionProvider';
|
||||
export * from './ollamaVisionProvider';
|
||||
export * from './ollamaVisionProvider';
|
||||
export * from './openRouterVisionProvider';
|
||||
171
src/providers/vision/openRouterVisionProvider.ts
Normal file
171
src/providers/vision/openRouterVisionProvider.ts
Normal file
@@ -0,0 +1,171 @@
|
||||
import fs from 'fs';
|
||||
import axios, { AxiosInstance } from 'axios';
|
||||
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
|
||||
|
||||
/**
 * Vision provider that sends images to OpenRouter's `/chat/completions`
 * endpoint as base64 `data:image/jpeg` URLs.
 *
 * All three public methods share the same request/response handling: errors
 * (including failures to read an image file) are logged and converted into a
 * fallback description with zeroed token usage rather than thrown.
 */
export class OpenRouterVisionProvider implements VisionProvider {
  private config: VisionProviderConfig;
  private axiosInstance: AxiosInstance;

  /**
   * @param config - Provider configuration; `apiKey` is sent as a bearer token
   *                 and `baseUrl` overrides the default OpenRouter endpoint.
   */
  constructor(config: VisionProviderConfig) {
    this.config = config;
    this.axiosInstance = axios.create({
      baseURL: config.baseUrl || 'https://openrouter.ai/api/v1',
      headers: {
        'Authorization': `Bearer ${config.apiKey}`,
        'Content-Type': 'application/json',
        'HTTP-Referer': 'https://github.com/anomalyco/aidio-description',
        'X-Title': 'Aidio Description Generator'
      }
    });
  }

  /**
   * Describe a single image.
   *
   * @param imagePath - Path of the image file to describe.
   * @param prompt - Instruction text sent alongside the image.
   */
  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
    return this.requestDescription(
      'describeImage',
      'Unable to describe this image.',
      () => ({
        model: this.config.model,
        temperature: 0.1,
        messages: [
          {
            role: 'user',
            content: [
              { type: 'text', text: prompt },
              this.imagePart(imagePath)
            ]
          }
        ],
        max_tokens: this.config.maxTokens || 300
      })
    );
  }

  /**
   * Describe the differences between two images.
   *
   * @param image1Path - Path of the first image.
   * @param image2Path - Path of the second image.
   * @param prompt - Instruction text sent alongside the images.
   */
  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
    return this.requestDescription(
      'compareImages',
      'Unable to describe the differences between these images.',
      () => ({
        model: this.config.model,
        temperature: 0.1,
        messages: [
          {
            role: 'user',
            content: [
              { type: 'text', text: prompt },
              this.imagePart(image1Path),
              this.imagePart(image2Path)
            ]
          }
        ],
        max_tokens: this.config.maxTokens || 300
      })
    );
  }

  /**
   * Describe a batch of images in one request.
   *
   * When `lastBatchContext.lastDescription` is set it is prepended as a
   * system message so the model can continue from the previous batch.
   *
   * @param imagePaths - Paths of the images in this batch, in order.
   * @param lastBatchContext - Context carried over from the previous batch.
   * @param prompt - Instruction text sent alongside the images.
   */
  async describeBatch(
    imagePaths: string[],
    lastBatchContext: BatchContext,
    prompt: string
  ): Promise<VisionResult> {
    return this.requestDescription(
      'describeBatch',
      'Unable to describe this batch of images.',
      () => {
        const userMessage = {
          role: 'user',
          content: [
            { type: 'text', text: prompt },
            ...imagePaths.map(imagePath => this.imagePart(imagePath))
          ]
        };

        const messages: Array<Record<string, unknown>> = [];
        if (lastBatchContext && lastBatchContext.lastDescription) {
          messages.push({
            role: 'system',
            content: `Previous batch summary: ${lastBatchContext.lastDescription}`
          });
        }
        messages.push(userMessage);

        // NOTE(review): unlike the single-image requests, no `temperature`
        // is sent for batches — confirm whether that is intentional.
        return {
          model: this.config.model,
          messages,
          max_tokens: this.config.maxTokens || 300
        };
      }
    );
  }

  /** Read an image file and wrap it as an `image_url` content part. */
  private imagePart(imagePath: string): { type: string; image_url: { url: string } } {
    const base64 = fs.readFileSync(imagePath).toString('base64');
    return {
      type: 'image_url',
      image_url: { url: `data:image/jpeg;base64,${base64}` }
    };
  }

  /**
   * Post a chat-completion request and map the response to a VisionResult.
   *
   * `buildBody` runs inside the try block so that file-read errors raised
   * while assembling the payload are handled like request errors.
   *
   * @param operation - Method name used in the error log line.
   * @param fallbackDescription - Description returned when anything fails.
   * @param buildBody - Produces the JSON request body.
   */
  private async requestDescription(
    operation: string,
    fallbackDescription: string,
    buildBody: () => Record<string, unknown>
  ): Promise<VisionResult> {
    try {
      const response = await this.axiosInstance.post('/chat/completions', buildBody());
      const data = response.data;

      return {
        description: data.choices?.[0]?.message?.content?.trim() || 'No description generated.',
        usage: {
          inputTokens: data.usage?.prompt_tokens || 0,
          outputTokens: data.usage?.completion_tokens || 0,
          totalTokens: data.usage?.total_tokens || 0
        }
      };
    } catch (error: any) {
      console.error(`OpenRouter ${operation} error:`, error.response?.data || error.message);
      return {
        description: fallbackDescription,
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }
}
|
||||
@@ -3,6 +3,7 @@ import { Config } from '../../config/config';
|
||||
import { OpenAIVisionProvider } from './openAIVisionProvider';
|
||||
import { GeminiVisionProvider } from './geminiVisionProvider';
|
||||
import { OllamaVisionProvider } from './ollamaVisionProvider';
|
||||
import { OpenRouterVisionProvider } from './openRouterVisionProvider';
|
||||
|
||||
/**
|
||||
* Factory for creating vision AI providers
|
||||
@@ -23,6 +24,8 @@ export class VisionProviderFactory {
|
||||
return new GeminiVisionProvider(providerConfig);
|
||||
case "ollama":
|
||||
return new OllamaVisionProvider(providerConfig);
|
||||
case 'openrouter':
|
||||
return new OpenRouterVisionProvider(providerConfig);
|
||||
// Add other providers here
|
||||
default:
|
||||
throw new Error(`Vision provider "${providerName}" not implemented.`);
|
||||
|
||||
@@ -1,150 +1,182 @@
|
||||
import { Config } from '../config/config';
|
||||
import { CostBreakdown } from '../interfaces';
|
||||
import { getVideoDuration } from './mediaUtils';
|
||||
|
||||
/**
|
||||
* Estimate the cost of generating audio descriptions for a video
|
||||
* @param videoFilePath - Path to the input video file
|
||||
* @param options - Optional configuration overrides
|
||||
* @returns Cost estimation breakdown
|
||||
*/
|
||||
export async function estimateCost(
|
||||
videoFilePath: string,
|
||||
options: Partial<Config> = {}
|
||||
): Promise<CostBreakdown> {
|
||||
// Merge provided options with defaults
|
||||
const settings = { ...options } as Config;
|
||||
|
||||
// Get video duration
|
||||
const videoDuration = getVideoDuration(videoFilePath);
|
||||
console.log(`Video duration: ${videoDuration} seconds`);
|
||||
|
||||
// Calculate the number of frames or batches to process
|
||||
let totalUnits: number;
|
||||
let unitCostMultiplier: number;
|
||||
let unitType: string;
|
||||
|
||||
if (settings.batchTimeMode) {
|
||||
totalUnits = Math.floor(videoDuration / settings.batchWindowDuration);
|
||||
unitCostMultiplier = settings.framesInBatch; // Cost multiplier for batch mode
|
||||
unitType = "batches";
|
||||
} else {
|
||||
totalUnits = Math.floor(videoDuration / settings.captureIntervalSeconds);
|
||||
unitCostMultiplier = 1; // No multiplier for normal mode
|
||||
unitType = "frames";
|
||||
}
|
||||
|
||||
console.log(`Will process ${totalUnits} ${unitType}`);
|
||||
|
||||
// Pricing constants (as of March 2025, update as needed)
|
||||
const pricing: {
|
||||
vision: Record<string, Record<string, { input: number; output: number }>>;
|
||||
tts: Record<string, Record<string, number>>;
|
||||
} = {
|
||||
vision: {
|
||||
openai: {
|
||||
'gpt-4o': {
|
||||
input: 0.0025,
|
||||
output: 0.01
|
||||
}
|
||||
},
|
||||
gemini: {
|
||||
'gemini-pro-vision': {
|
||||
input: 0.0025,
|
||||
output: 0.0025
|
||||
}
|
||||
}
|
||||
},
|
||||
tts: {
|
||||
openai: {
|
||||
'tts-1': 0.015,
|
||||
'tts-1-hd': 0.030
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Get the pricing for the selected providers
|
||||
const visionProvider = settings.visionProvider;
|
||||
const visionModel = settings.visionProviders[visionProvider].model;
|
||||
const ttsProvider = settings.ttsProvider;
|
||||
const ttsModel = settings.ttsProviders[ttsProvider].model;
|
||||
|
||||
// Check if the pricing data exists
|
||||
const visionPricing = pricing.vision[visionProvider]?.[visionModel];
|
||||
const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel];
|
||||
|
||||
if (!visionPricing) {
|
||||
console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`);
|
||||
}
|
||||
|
||||
if (!ttsPricing) {
|
||||
console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`);
|
||||
}
|
||||
|
||||
// Estimated token counts
|
||||
const estimatedVisionInputTokens = 1000 * unitCostMultiplier; // Base tokens for the vision input
|
||||
const estimatedPromptTokens = 100; // Tokens for the prompt text
|
||||
const estimatedOutputTokensPerUnit = 75; // Average tokens for description output
|
||||
|
||||
// Estimated character counts for TTS
|
||||
const estimatedCharsPerDescription = 200; // Average characters per description
|
||||
|
||||
// Calculate estimated costs for first unit
|
||||
const firstUnitCost = {
|
||||
visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
|
||||
visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
|
||||
tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000
|
||||
};
|
||||
|
||||
// For subsequent units, we need context (e.g., previous frames)
|
||||
const contextMultiplier = settings.batchTimeMode ? 1.2 : 2; // Less overhead in batch mode
|
||||
|
||||
const subsequentUnitCost = {
|
||||
visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
|
||||
visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
|
||||
tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000
|
||||
};
|
||||
|
||||
// Calculate total costs
|
||||
const totalVisionInputCost =
|
||||
firstUnitCost.visionInput +
|
||||
(totalUnits - 1) * subsequentUnitCost.visionInput;
|
||||
|
||||
const totalVisionOutputCost =
|
||||
firstUnitCost.visionOutput +
|
||||
(totalUnits - 1) * subsequentUnitCost.visionOutput;
|
||||
|
||||
const totalTTSCost =
|
||||
firstUnitCost.tts +
|
||||
(totalUnits - 1) * subsequentUnitCost.tts;
|
||||
|
||||
const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost;
|
||||
|
||||
// Create cost breakdown
|
||||
const costBreakdown: CostBreakdown = {
|
||||
videoInfo: {
|
||||
duration: videoDuration,
|
||||
totalUnits: totalUnits,
|
||||
unitType: unitType,
|
||||
processingInterval: settings.batchTimeMode ? settings.batchWindowDuration : settings.captureIntervalSeconds
|
||||
},
|
||||
providerInfo: {
|
||||
visionProvider: visionProvider,
|
||||
visionModel: visionModel,
|
||||
ttsProvider: ttsProvider,
|
||||
ttsModel: ttsModel
|
||||
},
|
||||
apiCosts: {
|
||||
visionInput: totalVisionInputCost.toFixed(4),
|
||||
visionOutput: totalVisionOutputCost.toFixed(4),
|
||||
tts: totalTTSCost.toFixed(4),
|
||||
total: totalCost.toFixed(4)
|
||||
},
|
||||
estimates: {
|
||||
totalAPICallsToProviders: totalUnits * 2, // Vision + TTS for each unit
|
||||
estimatedProcessingTimeMinutes: (totalUnits * 3) / 60 // rough estimate, 3 seconds per unit
|
||||
}
|
||||
};
|
||||
|
||||
return costBreakdown;
|
||||
}
|
||||
import { Config } from '../config/config';
|
||||
import { CostBreakdown } from '../interfaces';
|
||||
import { getVideoDuration } from './mediaUtils';
|
||||
|
||||
type TTSPricingModel = number | { inputTokens: number; outputTokens: number };
|
||||
|
||||
/**
 * Estimate the cost of generating audio descriptions for a video.
 *
 * The video is divided into units (frames, or batches when
 * `batchTimeMode` is set); each unit is assumed to need one vision request
 * and one TTS request. Units after the first carry extra vision input to
 * account for context from the previous unit.
 *
 * @param videoFilePath - Path to the input video file
 * @param options - Configuration to estimate with. No defaults are merged in
 *                  here, so the interval, provider and provider-model fields
 *                  read below must be present.
 * @returns Cost estimation breakdown
 * @throws Error when the processing interval is not a positive number or the
 *         selected vision/TTS provider has no configuration entry.
 */
export async function estimateCost(
  videoFilePath: string,
  options: Partial<Config> = {}
): Promise<CostBreakdown> {
  // Shallow copy only; the caller is responsible for supplying a full config.
  const settings = { ...options } as Config;

  // Get video duration
  const videoDuration = getVideoDuration(videoFilePath);
  console.log(`Video duration: ${videoDuration} seconds`);

  // Calculate the number of frames or batches to process
  const processingInterval = settings.batchTimeMode
    ? settings.batchWindowDuration
    : settings.captureIntervalSeconds;

  // A missing or zero interval would otherwise produce NaN/Infinity units.
  if (!Number.isFinite(processingInterval) || processingInterval <= 0) {
    throw new Error(`Invalid processing interval "${processingInterval}": expected a positive number of seconds.`);
  }

  const totalUnits = Math.floor(videoDuration / processingInterval);
  const unitCostMultiplier = settings.batchTimeMode ? settings.framesInBatch : 1;
  const unitType = settings.batchTimeMode ? "batches" : "frames";

  console.log(`Will process ${totalUnits} ${unitType}`);

  // Pricing constants (per 1K units unless otherwise noted)
  const pricing: {
    vision: Record<string, Record<string, { input: number; output: number }>>;
    tts: Record<string, Record<string, TTSPricingModel>>;
  } = {
    vision: {
      openai: {
        'gpt-4o': { input: 0.0025, output: 0.01 },
        'gpt-5.4-mini': { input: 0.00015, output: 0.0006 },
        'gpt-4o-mini': { input: 0.00015, output: 0.0006 }
      },
      gemini: {
        'gemini-2.0-flash': { input: 0.0001, output: 0.0004 },
        'gemini-1.5-flash': { input: 0.000075, output: 0.0003 },
        'gemini-1.5-pro': { input: 0.00125, output: 0.005 }
      },
      openrouter: {
        'anthropic/claude-sonnet-4.5': { input: 0.003, output: 0.015 },
        'anthropic/claude-3.5-sonnet': { input: 0.003, output: 0.015 },
        'anthropic/claude-3-haiku': { input: 0.0008, output: 0.004 },
        'google/gemini-2.0-flash-001': { input: 0.0001, output: 0.0004 }
      }
    },
    tts: {
      openai: {
        'tts-1': 0.015,
        'tts-1-hd': 0.030,
        'gpt-4o-mini-tts': { inputTokens: 0.60, outputTokens: 12.00 }
      },
      elevenlabs: {
        'eleven_multilingual_v2': 0.30,
        'eleven_turbo_v2.5': 0.015
      },
      google: {
        'chirp-hd': 0.016,
        'wavenet': 0.016,
        'neural2': 0.016,
        'standard': 0.004
      }
    }
  };

  // Get the selected providers and their models, failing with a clear
  // message instead of a TypeError when a provider entry is missing.
  const visionProvider = settings.visionProvider;
  const visionProviderConfig = settings.visionProviders?.[visionProvider];
  if (!visionProviderConfig) {
    throw new Error(`No configuration found for vision provider "${visionProvider}".`);
  }
  const visionModel = visionProviderConfig.model;

  const ttsProvider = settings.ttsProvider;
  const ttsProviderConfig = settings.ttsProviders?.[ttsProvider];
  if (!ttsProviderConfig) {
    throw new Error(`No configuration found for TTS provider "${ttsProvider}".`);
  }
  const ttsModel = ttsProviderConfig.model;

  // Check if the pricing data exists; unknown models are costed at 0.
  const visionPricing = pricing.vision[visionProvider]?.[visionModel];
  const ttsPricing = pricing.tts[ttsProvider]?.[ttsModel];

  if (!visionPricing) {
    console.warn(`Warning: No pricing data for vision provider "${visionProvider}" and model "${visionModel}".`);
  }

  if (!ttsPricing) {
    console.warn(`Warning: No pricing data for TTS provider "${ttsProvider}" and model "${ttsModel}".`);
  }

  // Estimated token counts
  const estimatedVisionInputTokens = 1000 * unitCostMultiplier;
  const estimatedPromptTokens = 100;
  const estimatedOutputTokensPerUnit = 75;

  // Estimated character counts for TTS
  const estimatedCharsPerDescription = 200;

  const visionInputRate = visionPricing?.input || 0;
  const visionOutputRate = visionPricing?.output || 0;

  // Calculate estimated costs for first unit
  const firstUnitCost = {
    visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * visionInputRate / 1000,
    visionOutput: estimatedOutputTokensPerUnit * visionOutputRate / 1000,
    tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
  };

  // For subsequent units, we need context (e.g., previous frames)
  const contextMultiplier = settings.batchTimeMode ? 1.2 : 2;

  const subsequentUnitCost = {
    visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * visionInputRate / 1000,
    visionOutput: estimatedOutputTokensPerUnit * visionOutputRate / 1000,
    tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
  };

  // Calculate total costs. When the video is shorter than one interval there
  // are zero units; clamp the counts so the totals are 0 rather than negative.
  const firstUnits = totalUnits > 0 ? 1 : 0;
  const subsequentUnits = Math.max(totalUnits - 1, 0);

  const totalVisionInputCost =
    firstUnits * firstUnitCost.visionInput +
    subsequentUnits * subsequentUnitCost.visionInput;

  const totalVisionOutputCost =
    firstUnits * firstUnitCost.visionOutput +
    subsequentUnits * subsequentUnitCost.visionOutput;

  const totalTTSCost =
    firstUnits * firstUnitCost.tts +
    subsequentUnits * subsequentUnitCost.tts;

  const totalCost = totalVisionInputCost + totalVisionOutputCost + totalTTSCost;

  // Create cost breakdown
  const costBreakdown: CostBreakdown = {
    videoInfo: {
      duration: videoDuration,
      totalUnits: totalUnits,
      unitType: unitType,
      processingInterval: processingInterval
    },
    providerInfo: {
      visionProvider: visionProvider,
      visionModel: visionModel,
      ttsProvider: ttsProvider,
      ttsModel: ttsModel
    },
    apiCosts: {
      visionInput: totalVisionInputCost.toFixed(4),
      visionOutput: totalVisionOutputCost.toFixed(4),
      tts: totalTTSCost.toFixed(4),
      total: totalCost.toFixed(4)
    },
    estimates: {
      // One vision call plus one TTS call per unit
      totalAPICallsToProviders: totalUnits * 2,
      // Rough estimate of 3 seconds per unit
      estimatedProcessingTimeMinutes: (totalUnits * 3) / 60
    }
  };

  return costBreakdown;
}
|
||||
|
||||
/**
 * Estimate the TTS cost of speaking `charCount` characters.
 *
 * @param charCount - Number of characters to synthesize.
 * @param pricing - Either a price per 1000 characters (number), or per-token
 *                  prices per 1M tokens (`inputTokens` / `outputTokens`).
 *                  `undefined` (no pricing data) yields 0.
 * @param tokensPerChar - Rough conversion ratios used for token-priced models;
 *                        defaults to 0.25 input tokens and 3 output tokens
 *                        per character.
 * @returns Estimated cost; 0 when pricing is missing or `charCount` is not a
 *          positive finite number.
 */
function calculateTTSCost(
  charCount: number,
  pricing: TTSPricingModel | undefined,
  tokensPerChar: { input: number; output: number } = { input: 0.25, output: 3 }
): number {
  // Guard against missing pricing and against negative/NaN counts, which
  // would otherwise produce negative or NaN costs.
  if (!pricing || !Number.isFinite(charCount) || charCount <= 0) return 0;

  if (typeof pricing === 'number') {
    // Per-character pricing: cost per 1000 characters
    return charCount * pricing / 1000;
  }

  // Per-token pricing (e.g., gpt-4o-mini-tts): cost per 1M tokens
  const estimatedInputTokens = charCount * tokensPerChar.input;
  const estimatedOutputTokens = charCount * tokensPerChar.output; // audio output is token-heavy
  return (estimatedInputTokens * pricing.inputTokens + estimatedOutputTokens * pricing.outputTokens) / 1000000;
}
|
||||
|
||||
@@ -173,7 +173,8 @@ export async function generateAudioDescription(
|
||||
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
|
||||
voice: settings.ttsVoice,
|
||||
model: settings.ttsProviders[settings.ttsProvider].model,
|
||||
speedFactor: settings.ttsSpeedFactor
|
||||
speedFactor: settings.ttsSpeedFactor,
|
||||
instructions: settings.ttsInstructions
|
||||
});
|
||||
|
||||
const audioDuration = ttsResult.duration;
|
||||
@@ -304,7 +305,8 @@ async function generateAudioDescriptionBatch(
|
||||
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
|
||||
voice: settings.ttsVoice,
|
||||
model: settings.ttsProviders[settings.ttsProvider].model,
|
||||
speedFactor: settings.ttsSpeedFactor
|
||||
speedFactor: settings.ttsSpeedFactor,
|
||||
instructions: settings.ttsInstructions
|
||||
});
|
||||
|
||||
const audioDuration = ttsResult.duration;
|
||||
|
||||
Reference in New Issue
Block a user