Compare commits
5 Commits
fc02d2001c
...
3a198d7d50
| Author | SHA1 | Date | |
|---|---|---|---|
| 3a198d7d50 | |||
| f05e57493c | |||
| 6e9a26557f | |||
| eb15af3a36 | |||
| 19975917c5 |
895
package-lock.json
generated
895
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -19,6 +19,7 @@
|
|||||||
"prepublishOnly": "npm run build"
|
"prepublishOnly": "npm run build"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"@google-cloud/text-to-speech": "^6.4.1",
|
||||||
"@google/generative-ai": "^0.24.0",
|
"@google/generative-ai": "^0.24.0",
|
||||||
"axios": "^1.6.2",
|
"axios": "^1.6.2",
|
||||||
"dotenv": "^16.3.1",
|
"dotenv": "^16.3.1",
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ export interface CLIArgs {
|
|||||||
ttsModel?: string;
|
ttsModel?: string;
|
||||||
ttsVoice?: string;
|
ttsVoice?: string;
|
||||||
ttsSpeedFactor?: number;
|
ttsSpeedFactor?: number;
|
||||||
|
ttsInstructions?: string;
|
||||||
outputDir?: string;
|
outputDir?: string;
|
||||||
tempDir?: string;
|
tempDir?: string;
|
||||||
batchTimeMode?: boolean;
|
batchTimeMode?: boolean;
|
||||||
@@ -73,6 +74,10 @@ export function parseCommandLineArgs(): CLIArgs {
|
|||||||
describe: 'Speed factor for the audio playback',
|
describe: 'Speed factor for the audio playback',
|
||||||
type: 'number'
|
type: 'number'
|
||||||
})
|
})
|
||||||
|
.option('ttsInstructions', {
|
||||||
|
describe: 'Instructions for TTS voice style (gpt-4o-mini-tts)',
|
||||||
|
type: 'string'
|
||||||
|
})
|
||||||
.option('outputDir', {
|
.option('outputDir', {
|
||||||
alias: 'o',
|
alias: 'o',
|
||||||
describe: 'Directory for output files',
|
describe: 'Directory for output files',
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ export interface Config {
|
|||||||
ttsProvider: string;
|
ttsProvider: string;
|
||||||
ttsVoice: string;
|
ttsVoice: string;
|
||||||
ttsSpeedFactor: number;
|
ttsSpeedFactor: number;
|
||||||
|
ttsInstructions?: string;
|
||||||
ttsProviders: {
|
ttsProviders: {
|
||||||
[key: string]: TTSProviderConfig;
|
[key: string]: TTSProviderConfig;
|
||||||
};
|
};
|
||||||
@@ -61,6 +62,12 @@ export function getDefaultConfig(): Config {
|
|||||||
baseUrl: "http://localhost:11434",
|
baseUrl: "http://localhost:11434",
|
||||||
model: "gemma3:12b",
|
model: "gemma3:12b",
|
||||||
maxTokens: 3000
|
maxTokens: 3000
|
||||||
|
},
|
||||||
|
openrouter: {
|
||||||
|
apiKey: process.env.OPENROUTER_API_KEY,
|
||||||
|
model: "anthropic/claude-sonnet-4.5",
|
||||||
|
baseUrl: "https://openrouter.ai/api/v1",
|
||||||
|
maxTokens: 300
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
@@ -68,11 +75,23 @@ export function getDefaultConfig(): Config {
|
|||||||
ttsProvider: "openai",
|
ttsProvider: "openai",
|
||||||
ttsVoice: "alloy",
|
ttsVoice: "alloy",
|
||||||
ttsSpeedFactor: 1.5,
|
ttsSpeedFactor: 1.5,
|
||||||
|
ttsInstructions: "Speak in a calm, narrating tone suitable for audio descriptions. Keep a steady pace and clear enunciation.",
|
||||||
ttsProviders: {
|
ttsProviders: {
|
||||||
openai: {
|
openai: {
|
||||||
apiKey: process.env.OPENAI_API_KEY,
|
apiKey: process.env.OPENAI_API_KEY,
|
||||||
model: "tts-1-hd",
|
model: "gpt-4o-mini-tts",
|
||||||
voice: "alloy"
|
voice: "alloy"
|
||||||
|
},
|
||||||
|
elevenlabs: {
|
||||||
|
apiKey: process.env.ELEVENLABS_API_KEY,
|
||||||
|
model: "eleven_multilingual_v2",
|
||||||
|
voice: "JBFqnCBsd6RMkjVDRZzb"
|
||||||
|
},
|
||||||
|
google: {
|
||||||
|
apiKey: process.env.GOOGLE_CLOUD_TTS_KEY,
|
||||||
|
keyFilename: process.env.GOOGLE_CLOUD_TTS_KEYFILE,
|
||||||
|
model: "chirp-hd",
|
||||||
|
voice: "en-US-Chirp-HD-F"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|||||||
@@ -77,6 +77,10 @@ async function main(): Promise<void> {
|
|||||||
config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
|
config.ttsProviders[config.ttsProvider].voice = argv.ttsVoice;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (argv.ttsInstructions) {
|
||||||
|
config.ttsInstructions = argv.ttsInstructions;
|
||||||
|
}
|
||||||
|
|
||||||
if (argv.saveConfig) {
|
if (argv.saveConfig) {
|
||||||
saveConfigToFile(argv.saveConfig, config);
|
saveConfigToFile(argv.saveConfig, config);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,12 +39,14 @@ export interface TTSOptions {
|
|||||||
voice?: string;
|
voice?: string;
|
||||||
model?: string;
|
model?: string;
|
||||||
speedFactor?: number;
|
speedFactor?: number;
|
||||||
|
instructions?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface TTSProviderConfig {
|
export interface TTSProviderConfig {
|
||||||
apiKey?: string;
|
apiKey?: string;
|
||||||
model: string;
|
model: string;
|
||||||
voice?: string;
|
voice?: string;
|
||||||
|
keyFilename?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface TTSProvider {
|
export interface TTSProvider {
|
||||||
|
|||||||
93
src/providers/tts/elevenLabsTTSProvider.ts
Normal file
93
src/providers/tts/elevenLabsTTSProvider.ts
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
import fs from 'fs';
|
||||||
|
import { execSync } from 'child_process';
|
||||||
|
import axios, { AxiosInstance } from 'axios';
|
||||||
|
import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
|
||||||
|
import { getAudioDuration } from '../../utils/mediaUtils';
|
||||||
|
|
||||||
|
/**
 * TTS provider that synthesizes speech through the ElevenLabs HTTP API
 * (`POST /text-to-speech/{voice}`) and writes the result as an MP3 file.
 *
 * The speed factor is applied exactly once, locally, with ffmpeg's `atempo`
 * filter. It is deliberately NOT also sent as `voice_settings.speed`: doing
 * both would compound the factor (e.g. 1.5 would play back at 2.25x).
 * NOTE(review): the API-side `speed` setting also appears to accept only a
 * narrow range around 1.0 — confirm against the ElevenLabs API reference
 * before re-introducing it.
 */
export class ElevenLabsTTSProvider implements TTSProvider {
  private config: TTSProviderConfig;
  private axiosInstance: AxiosInstance;
  /** `request-id` header of the most recent successful response, if any. */
  private lastRequestId: string | null = null;

  /**
   * @param config Provider settings; `apiKey` is sent as the `xi-api-key`
   *   header on every request.
   */
  constructor(config: TTSProviderConfig) {
    this.config = config;
    this.axiosInstance = axios.create({
      baseURL: 'https://api.elevenlabs.io/v1',
      headers: {
        'xi-api-key': config.apiKey,
        'Content-Type': 'application/json'
      }
    });
  }

  /**
   * Synthesize `text` and write the audio to `outputPath`.
   *
   * On any failure the error is logged and a one-second silent MP3 is
   * written to `outputPath` instead, so callers always get a playable file.
   *
   * @param text Text to speak.
   * @param outputPath Destination audio file path.
   * @param options Optional voice / model / speed overrides.
   * @returns The audio duration as reported by `getAudioDuration`, and the
   *   cost expressed as the character count of `text` (0 for the silent
   *   fallback).
   */
  async textToSpeech(
    text: string,
    outputPath: string,
    options: TTSOptions = {}
  ): Promise<TTSResult> {
    // Raw API audio lands here first; e.g. "out.mp3" -> "out_temp.mp3".
    const tempOutputPath = outputPath.replace(/\.\w+$/, '_temp$&');

    try {
      const voice = options.voice || this.config.voice || 'JBFqnCBsd6RMkjVDRZzb';
      const model = options.model || this.config.model || 'eleven_multilingual_v2';
      const speedFactor = options.speedFactor || 1.0;

      const requestBody: any = {
        text,
        model_id: model,
        voice_settings: {
          stability: 0.5,
          similarity_boost: 0.75,
          use_speaker_boost: true
        }
      };

      // Link this request to the previous one.
      // NOTE(review): presumably used by ElevenLabs for request stitching
      // (prosody continuity across segments) — confirm.
      if (this.lastRequestId) {
        requestBody.previous_request_ids = [this.lastRequestId];
      }

      const response = await this.axiosInstance.post(
        `/text-to-speech/${voice}`,
        requestBody,
        {
          params: { output_format: 'mp3_44100_128' },
          responseType: 'arraybuffer'
        }
      );

      this.lastRequestId = response.headers['request-id'] || null;

      fs.writeFileSync(tempOutputPath, Buffer.from(response.data));

      if (speedFactor !== 1.0) {
        execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
        fs.unlinkSync(tempOutputPath);
      } else {
        fs.renameSync(tempOutputPath, outputPath);
      }

      return {
        duration: getAudioDuration(outputPath),
        cost: text.length
      };
    } catch (error: any) {
      if (error.response) {
        // The response body is an ArrayBuffer because of responseType above.
        console.error(`ElevenLabs TTS error (${error.response.status}):`,
          Buffer.from(error.response.data).toString());
      } else {
        console.error('ElevenLabs TTS error:', error.message);
      }

      // Do not leave a half-processed temp file behind when a later step
      // (ffmpeg, rename, duration probe) failed after the download.
      this.discardTempFile(tempOutputPath, outputPath);

      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);
      return {
        duration: 1,
        cost: 0
      };
    }
  }

  /**
   * Best-effort removal of the intermediate file. Skipped when the temp path
   * equals the output path (which happens when `outputPath` has no extension).
   */
  private discardTempFile(tempPath: string, outputPath: string): void {
    if (tempPath === outputPath) {
      return;
    }
    try {
      if (fs.existsSync(tempPath)) {
        fs.unlinkSync(tempPath);
      }
    } catch {
      // Cleanup failure must not mask the original error handling.
    }
  }
}
|
||||||
94
src/providers/tts/googleCloudTTSProvider.ts
Normal file
94
src/providers/tts/googleCloudTTSProvider.ts
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
import fs from 'fs';
|
||||||
|
import { execSync } from 'child_process';
|
||||||
|
import { TextToSpeechClient } from '@google-cloud/text-to-speech';
|
||||||
|
import { google } from '@google-cloud/text-to-speech/build/protos/protos';
|
||||||
|
import { TTSProvider, TTSProviderConfig, TTSOptions, TTSResult } from '../../interfaces';
|
||||||
|
import { getAudioDuration } from '../../utils/mediaUtils';
|
||||||
|
|
||||||
|
/**
 * TTS provider that synthesizes speech with Google Cloud Text-to-Speech and
 * writes the result as an MP3 file.
 *
 * The speed factor is applied exactly once, locally, with ffmpeg's `atempo`
 * filter. It is deliberately NOT also sent as `audioConfig.speakingRate`:
 * doing both would compound the factor (e.g. 1.5 would play back at 2.25x).
 */
export class GoogleCloudTTSProvider implements TTSProvider {
  private config: TTSProviderConfig;
  private client: TextToSpeechClient;

  /**
   * @param config Provider settings. `apiKey` is always forwarded to the
   *   client; `keyFilename` is forwarded only when set.
   */
  constructor(config: TTSProviderConfig) {
    this.config = config;

    const clientConfig: any = {
      apiKey: config.apiKey,
      fallback: true
    };

    if (config.keyFilename) {
      clientConfig.keyFilename = config.keyFilename;
    }

    this.client = new TextToSpeechClient(clientConfig);
  }

  /**
   * Synthesize `text` and write the audio to `outputPath`.
   *
   * On any failure the error is logged and a one-second silent MP3 is
   * written to `outputPath` instead, so callers always get a playable file.
   *
   * @param text Text to speak.
   * @param outputPath Destination audio file path.
   * @param options Optional voice / speed overrides. `options.model` is not
   *   used by this provider: the request selects the voice by name only.
   * @returns The audio duration as reported by `getAudioDuration`, and the
   *   cost expressed as the character count of `text` (0 for the silent
   *   fallback).
   */
  async textToSpeech(
    text: string,
    outputPath: string,
    options: TTSOptions = {}
  ): Promise<TTSResult> {
    // Raw API audio lands here first; e.g. "out.mp3" -> "out_temp.mp3".
    const tempOutputPath = outputPath.replace(/\.\w+$/, '_temp$&');

    try {
      const voice = options.voice || this.config.voice || 'en-US-Chirp-HD-F';
      const speedFactor = options.speedFactor || 1.0;

      const request: google.cloud.texttospeech.v1.ISynthesizeSpeechRequest = {
        input: { text },
        voice: {
          languageCode: this.extractLanguageCode(voice),
          name: voice
        },
        audioConfig: {
          audioEncoding: 'MP3'
        }
      };

      const [response] = await this.client.synthesizeSpeech(request);

      const audioContent = response.audioContent;
      if (!audioContent) {
        throw new Error('No audio content returned from Google Cloud TTS');
      }

      // The field is typed as bytes-or-string.
      // NOTE(review): a string payload is assumed to be the base64 JSON
      // encoding of the bytes field — confirm against the client library.
      const audioBuffer = typeof audioContent === 'string'
        ? Buffer.from(audioContent, 'base64')
        : Buffer.from(audioContent);

      fs.writeFileSync(tempOutputPath, audioBuffer);

      if (speedFactor !== 1.0) {
        execSync(`ffmpeg -v error -i "${tempOutputPath}" -filter:a "atempo=${speedFactor}" -c:a libmp3lame -q:a 2 "${outputPath}" -y`);
        fs.unlinkSync(tempOutputPath);
      } else {
        fs.renameSync(tempOutputPath, outputPath);
      }

      return {
        duration: getAudioDuration(outputPath),
        cost: text.length
      };
    } catch (error: any) {
      console.error('Google Cloud TTS error:', error.message);

      // Do not leave a half-processed temp file behind when a later step
      // (ffmpeg, rename, duration probe) failed after synthesis.
      this.discardTempFile(tempOutputPath, outputPath);

      execSync(`ffmpeg -v error -f lavfi -i anullsrc=r=24000:cl=mono -t 1 -q:a 9 -acodec libmp3lame "${outputPath}" -y`);
      return {
        duration: 1,
        cost: 0
      };
    }
  }

  /**
   * Derive the language code from a voice name by joining its first two
   * dash-separated parts (e.g. "en-US-Chirp-HD-F" -> "en-US"). Falls back to
   * "en-US" when the name has fewer than two parts.
   */
  private extractLanguageCode(voiceName: string): string {
    const [language, region] = voiceName.split('-');
    return region === undefined ? 'en-US' : `${language}-${region}`;
  }

  /**
   * Best-effort removal of the intermediate file. Skipped when the temp path
   * equals the output path (which happens when `outputPath` has no extension).
   */
  private discardTempFile(tempPath: string, outputPath: string): void {
    if (tempPath === outputPath) {
      return;
    }
    try {
      if (fs.existsSync(tempPath)) {
        fs.unlinkSync(tempPath);
      }
    } catch {
      // Cleanup failure must not mask the original error handling.
    }
  }
}
|
||||||
@@ -1,2 +1,4 @@
|
|||||||
export * from './ttsProviderFactory';
|
export * from './ttsProviderFactory';
|
||||||
export * from './openAITTSProvider';
|
export * from './openAITTSProvider';
|
||||||
|
export * from './elevenLabsTTSProvider';
|
||||||
|
export * from './googleCloudTTSProvider';
|
||||||
@@ -41,8 +41,9 @@ export class OpenAITTSProvider implements TTSProvider {
|
|||||||
|
|
||||||
const mp3 = await this.openai.audio.speech.create({
|
const mp3 = await this.openai.audio.speech.create({
|
||||||
model: model,
|
model: model,
|
||||||
voice: voice as any, // Type casting to any to avoid type issues
|
voice: voice as any,
|
||||||
input: text
|
input: text,
|
||||||
|
...(options.instructions ? { instructions: options.instructions } : {})
|
||||||
});
|
});
|
||||||
|
|
||||||
// Cost calculation is based on character count
|
// Cost calculation is based on character count
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
import { TTSProvider } from '../../interfaces';
|
import { TTSProvider } from '../../interfaces';
|
||||||
import { Config } from '../../config/config';
|
import { Config } from '../../config/config';
|
||||||
import { OpenAITTSProvider } from './openAITTSProvider';
|
import { OpenAITTSProvider } from './openAITTSProvider';
|
||||||
|
import { ElevenLabsTTSProvider } from './elevenLabsTTSProvider';
|
||||||
|
import { GoogleCloudTTSProvider } from './googleCloudTTSProvider';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Factory for creating TTS providers
|
* Factory for creating TTS providers
|
||||||
@@ -17,6 +19,10 @@ export class TTSProviderFactory {
|
|||||||
switch (providerName) {
|
switch (providerName) {
|
||||||
case 'openai':
|
case 'openai':
|
||||||
return new OpenAITTSProvider(providerConfig);
|
return new OpenAITTSProvider(providerConfig);
|
||||||
|
case 'elevenlabs':
|
||||||
|
return new ElevenLabsTTSProvider(providerConfig);
|
||||||
|
case 'google':
|
||||||
|
return new GoogleCloudTTSProvider(providerConfig);
|
||||||
// Add other providers here
|
// Add other providers here
|
||||||
default:
|
default:
|
||||||
throw new Error(`TTS provider "${providerName}" not implemented.`);
|
throw new Error(`TTS provider "${providerName}" not implemented.`);
|
||||||
|
|||||||
@@ -2,3 +2,4 @@ export * from './visionProviderFactory';
|
|||||||
export * from './openAIVisionProvider';
|
export * from './openAIVisionProvider';
|
||||||
export * from './geminiVisionProvider';
|
export * from './geminiVisionProvider';
|
||||||
export * from './ollamaVisionProvider';
|
export * from './ollamaVisionProvider';
|
||||||
|
export * from './openRouterVisionProvider';
|
||||||
171
src/providers/vision/openRouterVisionProvider.ts
Normal file
171
src/providers/vision/openRouterVisionProvider.ts
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
import fs from 'fs';
|
||||||
|
import axios, { AxiosInstance } from 'axios';
|
||||||
|
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
|
||||||
|
|
||||||
|
/**
 * Vision provider that sends images to an OpenRouter chat-completions
 * endpoint as base64 data URLs and returns the model's text description.
 *
 * Every public method logs and swallows failures, returning a fixed fallback
 * description with zeroed token usage instead of throwing.
 */
export class OpenRouterVisionProvider implements VisionProvider {
  private config: VisionProviderConfig;
  private axiosInstance: AxiosInstance;

  /**
   * @param config Provider settings; `baseUrl` defaults to the public
   *   OpenRouter API and `apiKey` is sent as a Bearer token.
   */
  constructor(config: VisionProviderConfig) {
    this.config = config;
    this.axiosInstance = axios.create({
      baseURL: config.baseUrl || 'https://openrouter.ai/api/v1',
      headers: {
        'Authorization': `Bearer ${config.apiKey}`,
        'Content-Type': 'application/json',
        'HTTP-Referer': 'https://github.com/anomalyco/aidio-description',
        'X-Title': 'Aidio Description Generator'
      }
    });
  }

  /**
   * Describe a single image.
   *
   * @param imagePath Path of the image file to send.
   * @param prompt Instruction text sent alongside the image.
   */
  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
    try {
      const imagePart = OpenRouterVisionProvider.imagePart(imagePath);
      // `return await` keeps request failures inside this try block.
      return await this.requestCompletion(
        [{ role: 'user', content: [{ type: 'text', text: prompt }, imagePart] }],
        0.1
      );
    } catch (error: any) {
      console.error('OpenRouter describeImage error:', error.response?.data || error.message);
      return OpenRouterVisionProvider.fallbackResult('Unable to describe this image.');
    }
  }

  /**
   * Describe the differences between two images, sent in the given order.
   *
   * @param image1Path Path of the first image.
   * @param image2Path Path of the second image.
   * @param prompt Instruction text sent alongside the images.
   */
  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
    try {
      const firstPart = OpenRouterVisionProvider.imagePart(image1Path);
      const secondPart = OpenRouterVisionProvider.imagePart(image2Path);
      return await this.requestCompletion(
        [{ role: 'user', content: [{ type: 'text', text: prompt }, firstPart, secondPart] }],
        0.1
      );
    } catch (error: any) {
      console.error('OpenRouter compareImages error:', error.response?.data || error.message);
      return OpenRouterVisionProvider.fallbackResult(
        'Unable to describe the differences between these images.'
      );
    }
  }

  /**
   * Describe a batch of images in one request. When the previous batch has a
   * description, it is prepended as a system message.
   *
   * @param imagePaths Paths of the images, sent in order.
   * @param lastBatchContext Context carried over from the previous batch.
   * @param prompt Instruction text sent alongside the images.
   */
  async describeBatch(
    imagePaths: string[],
    lastBatchContext: BatchContext,
    prompt: string
  ): Promise<VisionResult> {
    try {
      const imageParts = imagePaths.map(fp => OpenRouterVisionProvider.imagePart(fp));

      const messages: unknown[] = [];
      if (lastBatchContext && lastBatchContext.lastDescription) {
        messages.push({
          role: 'system',
          content: `Previous batch summary: ${lastBatchContext.lastDescription}`
        });
      }
      messages.push({
        role: 'user',
        content: [{ type: 'text', text: prompt }, ...imageParts]
      });

      // No explicit temperature for batches (the endpoint default applies).
      return await this.requestCompletion(messages);
    } catch (error: any) {
      console.error('OpenRouter describeBatch error:', error.response?.data || error.message);
      return OpenRouterVisionProvider.fallbackResult('Unable to describe this batch of images.');
    }
  }

  /**
   * POST the messages to `/chat/completions` and convert the response into a
   * `VisionResult`. Rejects when the HTTP request fails.
   *
   * @param messages Chat messages in OpenAI-compatible format.
   * @param temperature Sampling temperature; omitted from the request when
   *   undefined.
   */
  private async requestCompletion(messages: unknown[], temperature?: number): Promise<VisionResult> {
    const response = await this.axiosInstance.post('/chat/completions', {
      model: this.config.model,
      ...(temperature !== undefined ? { temperature } : {}),
      messages,
      max_tokens: this.config.maxTokens || 300
    });

    const data = response.data;
    return {
      description: data.choices?.[0]?.message?.content?.trim() || 'No description generated.',
      usage: {
        inputTokens: data.usage?.prompt_tokens || 0,
        outputTokens: data.usage?.completion_tokens || 0,
        totalTokens: data.usage?.total_tokens || 0
      }
    };
  }

  /** Read an image file and wrap it as an `image_url` content part. */
  private static imagePart(imagePath: string): { type: string; image_url: { url: string } } {
    const base64 = fs.readFileSync(imagePath).toString('base64');
    const mimeType = OpenRouterVisionProvider.mimeTypeFor(imagePath);
    return {
      type: 'image_url',
      image_url: { url: `data:${mimeType};base64,${base64}` }
    };
  }

  /**
   * Pick the data-URL MIME type from the file extension. Unknown or missing
   * extensions keep the previous behavior of declaring JPEG.
   * NOTE(review): some upstream models reportedly reject a payload whose
   * declared media type does not match the bytes — confirm.
   */
  private static mimeTypeFor(imagePath: string): string {
    const extension = imagePath.slice(imagePath.lastIndexOf('.') + 1).toLowerCase();
    switch (extension) {
      case 'png':
        return 'image/png';
      case 'webp':
        return 'image/webp';
      case 'gif':
        return 'image/gif';
      default:
        return 'image/jpeg';
    }
  }

  /** Result returned when a request fails: fixed text and zero usage. */
  private static fallbackResult(description: string): VisionResult {
    return {
      description,
      usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
    };
  }
}
|
||||||
@@ -3,6 +3,7 @@ import { Config } from '../../config/config';
|
|||||||
import { OpenAIVisionProvider } from './openAIVisionProvider';
|
import { OpenAIVisionProvider } from './openAIVisionProvider';
|
||||||
import { GeminiVisionProvider } from './geminiVisionProvider';
|
import { GeminiVisionProvider } from './geminiVisionProvider';
|
||||||
import { OllamaVisionProvider } from './ollamaVisionProvider';
|
import { OllamaVisionProvider } from './ollamaVisionProvider';
|
||||||
|
import { OpenRouterVisionProvider } from './openRouterVisionProvider';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Factory for creating vision AI providers
|
* Factory for creating vision AI providers
|
||||||
@@ -23,6 +24,8 @@ export class VisionProviderFactory {
|
|||||||
return new GeminiVisionProvider(providerConfig);
|
return new GeminiVisionProvider(providerConfig);
|
||||||
case "ollama":
|
case "ollama":
|
||||||
return new OllamaVisionProvider(providerConfig);
|
return new OllamaVisionProvider(providerConfig);
|
||||||
|
case 'openrouter':
|
||||||
|
return new OpenRouterVisionProvider(providerConfig);
|
||||||
// Add other providers here
|
// Add other providers here
|
||||||
default:
|
default:
|
||||||
throw new Error(`Vision provider "${providerName}" not implemented.`);
|
throw new Error(`Vision provider "${providerName}" not implemented.`);
|
||||||
|
|||||||
@@ -2,6 +2,8 @@ import { Config } from '../config/config';
|
|||||||
import { CostBreakdown } from '../interfaces';
|
import { CostBreakdown } from '../interfaces';
|
||||||
import { getVideoDuration } from './mediaUtils';
|
import { getVideoDuration } from './mediaUtils';
|
||||||
|
|
||||||
|
type TTSPricingModel = number | { inputTokens: number; outputTokens: number };
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Estimate the cost of generating audio descriptions for a video
|
* Estimate the cost of generating audio descriptions for a video
|
||||||
* @param videoFilePath - Path to the input video file
|
* @param videoFilePath - Path to the input video file
|
||||||
@@ -26,39 +28,54 @@ export async function estimateCost(
|
|||||||
|
|
||||||
if (settings.batchTimeMode) {
|
if (settings.batchTimeMode) {
|
||||||
totalUnits = Math.floor(videoDuration / settings.batchWindowDuration);
|
totalUnits = Math.floor(videoDuration / settings.batchWindowDuration);
|
||||||
unitCostMultiplier = settings.framesInBatch; // Cost multiplier for batch mode
|
unitCostMultiplier = settings.framesInBatch;
|
||||||
unitType = "batches";
|
unitType = "batches";
|
||||||
} else {
|
} else {
|
||||||
totalUnits = Math.floor(videoDuration / settings.captureIntervalSeconds);
|
totalUnits = Math.floor(videoDuration / settings.captureIntervalSeconds);
|
||||||
unitCostMultiplier = 1; // No multiplier for normal mode
|
unitCostMultiplier = 1;
|
||||||
unitType = "frames";
|
unitType = "frames";
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`Will process ${totalUnits} ${unitType}`);
|
console.log(`Will process ${totalUnits} ${unitType}`);
|
||||||
|
|
||||||
// Pricing constants (as of March 2025, update as needed)
|
// Pricing constants (per 1K units unless otherwise noted)
|
||||||
const pricing: {
|
const pricing: {
|
||||||
vision: Record<string, Record<string, { input: number; output: number }>>;
|
vision: Record<string, Record<string, { input: number; output: number }>>;
|
||||||
tts: Record<string, Record<string, number>>;
|
tts: Record<string, Record<string, TTSPricingModel>>;
|
||||||
} = {
|
} = {
|
||||||
vision: {
|
vision: {
|
||||||
openai: {
|
openai: {
|
||||||
'gpt-4o': {
|
'gpt-4o': { input: 0.0025, output: 0.01 },
|
||||||
input: 0.0025,
|
'gpt-5.4-mini': { input: 0.00015, output: 0.0006 },
|
||||||
output: 0.01
|
'gpt-4o-mini': { input: 0.00015, output: 0.0006 }
|
||||||
}
|
|
||||||
},
|
},
|
||||||
gemini: {
|
gemini: {
|
||||||
'gemini-pro-vision': {
|
'gemini-2.0-flash': { input: 0.0001, output: 0.0004 },
|
||||||
input: 0.0025,
|
'gemini-1.5-flash': { input: 0.000075, output: 0.0003 },
|
||||||
output: 0.0025
|
'gemini-1.5-pro': { input: 0.00125, output: 0.005 }
|
||||||
}
|
},
|
||||||
|
openrouter: {
|
||||||
|
'anthropic/claude-sonnet-4.5': { input: 0.003, output: 0.015 },
|
||||||
|
'anthropic/claude-3.5-sonnet': { input: 0.003, output: 0.015 },
|
||||||
|
'anthropic/claude-3-haiku': { input: 0.0008, output: 0.004 },
|
||||||
|
'google/gemini-2.0-flash-001': { input: 0.0001, output: 0.0004 }
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
tts: {
|
tts: {
|
||||||
openai: {
|
openai: {
|
||||||
'tts-1': 0.015,
|
'tts-1': 0.015,
|
||||||
'tts-1-hd': 0.030
|
'tts-1-hd': 0.030,
|
||||||
|
'gpt-4o-mini-tts': { inputTokens: 0.60, outputTokens: 12.00 }
|
||||||
|
},
|
||||||
|
elevenlabs: {
|
||||||
|
'eleven_multilingual_v2': 0.30,
|
||||||
|
'eleven_turbo_v2.5': 0.015
|
||||||
|
},
|
||||||
|
google: {
|
||||||
|
'chirp-hd': 0.016,
|
||||||
|
'wavenet': 0.016,
|
||||||
|
'neural2': 0.016,
|
||||||
|
'standard': 0.004
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -82,27 +99,27 @@ export async function estimateCost(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Estimated token counts
|
// Estimated token counts
|
||||||
const estimatedVisionInputTokens = 1000 * unitCostMultiplier; // Base tokens for the vision input
|
const estimatedVisionInputTokens = 1000 * unitCostMultiplier;
|
||||||
const estimatedPromptTokens = 100; // Tokens for the prompt text
|
const estimatedPromptTokens = 100;
|
||||||
const estimatedOutputTokensPerUnit = 75; // Average tokens for description output
|
const estimatedOutputTokensPerUnit = 75;
|
||||||
|
|
||||||
// Estimated character counts for TTS
|
// Estimated character counts for TTS
|
||||||
const estimatedCharsPerDescription = 200; // Average characters per description
|
const estimatedCharsPerDescription = 200;
|
||||||
|
|
||||||
// Calculate estimated costs for first unit
|
// Calculate estimated costs for first unit
|
||||||
const firstUnitCost = {
|
const firstUnitCost = {
|
||||||
visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
|
visionInput: (estimatedVisionInputTokens + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
|
||||||
visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
|
visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
|
||||||
tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000
|
tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
|
||||||
};
|
};
|
||||||
|
|
||||||
// For subsequent units, we need context (e.g., previous frames)
|
// For subsequent units, we need context (e.g., previous frames)
|
||||||
const contextMultiplier = settings.batchTimeMode ? 1.2 : 2; // Less overhead in batch mode
|
const contextMultiplier = settings.batchTimeMode ? 1.2 : 2;
|
||||||
|
|
||||||
const subsequentUnitCost = {
|
const subsequentUnitCost = {
|
||||||
visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
|
visionInput: (estimatedVisionInputTokens * contextMultiplier + estimatedPromptTokens) * (visionPricing?.input || 0) / 1000,
|
||||||
visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
|
visionOutput: estimatedOutputTokensPerUnit * (visionPricing?.output || 0) / 1000,
|
||||||
tts: estimatedCharsPerDescription * (ttsPricing || 0) / 1000
|
tts: calculateTTSCost(estimatedCharsPerDescription, ttsPricing)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Calculate total costs
|
// Calculate total costs
|
||||||
@@ -141,10 +158,25 @@ export async function estimateCost(
|
|||||||
total: totalCost.toFixed(4)
|
total: totalCost.toFixed(4)
|
||||||
},
|
},
|
||||||
estimates: {
|
estimates: {
|
||||||
totalAPICallsToProviders: totalUnits * 2, // Vision + TTS for each unit
|
totalAPICallsToProviders: totalUnits * 2,
|
||||||
estimatedProcessingTimeMinutes: (totalUnits * 3) / 60 // rough estimate, 3 seconds per unit
|
estimatedProcessingTimeMinutes: (totalUnits * 3) / 60
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
return costBreakdown;
|
return costBreakdown;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Estimate the TTS cost of speaking `charCount` characters.
 *
 * @param charCount Number of characters to synthesize. Zero, negative, or
 *   non-finite values yield a cost of 0 so a bad estimate cannot turn the
 *   total negative or NaN.
 * @param pricing Either a flat price per 1,000 characters (number), or
 *   per-token prices per 1,000,000 tokens (`inputTokens` / `outputTokens`).
 *   When undefined (no pricing known for the model) the cost is 0.
 * @returns The estimated cost in the same currency unit as `pricing`.
 */
function calculateTTSCost(charCount: number, pricing: TTSPricingModel | undefined): number {
  // Heuristics for token-priced models (e.g., gpt-4o-mini-tts):
  // roughly 1 char ≈ 0.25 text tokens for English, and audio output is
  // token-heavy, estimated at 3 tokens per character.
  const INPUT_TOKENS_PER_CHAR = 0.25;
  const OUTPUT_TOKENS_PER_CHAR = 3;

  if (!pricing || !Number.isFinite(charCount) || charCount <= 0) {
    return 0;
  }

  if (typeof pricing === 'number') {
    // Per-character pricing: cost per 1000 characters
    return charCount * pricing / 1000;
  }

  // Per-token pricing: cost per 1M tokens
  const estimatedInputTokens = charCount * INPUT_TOKENS_PER_CHAR;
  const estimatedOutputTokens = charCount * OUTPUT_TOKENS_PER_CHAR;
  return (estimatedInputTokens * pricing.inputTokens + estimatedOutputTokens * pricing.outputTokens) / 1000000;
}
|
||||||
|
|||||||
@@ -173,7 +173,8 @@ export async function generateAudioDescription(
|
|||||||
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
|
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
|
||||||
voice: settings.ttsVoice,
|
voice: settings.ttsVoice,
|
||||||
model: settings.ttsProviders[settings.ttsProvider].model,
|
model: settings.ttsProviders[settings.ttsProvider].model,
|
||||||
speedFactor: settings.ttsSpeedFactor
|
speedFactor: settings.ttsSpeedFactor,
|
||||||
|
instructions: settings.ttsInstructions
|
||||||
});
|
});
|
||||||
|
|
||||||
const audioDuration = ttsResult.duration;
|
const audioDuration = ttsResult.duration;
|
||||||
@@ -304,7 +305,8 @@ async function generateAudioDescriptionBatch(
|
|||||||
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
|
const ttsResult = await ttsProvider.textToSpeech(description, audioFilePath, {
|
||||||
voice: settings.ttsVoice,
|
voice: settings.ttsVoice,
|
||||||
model: settings.ttsProviders[settings.ttsProvider].model,
|
model: settings.ttsProviders[settings.ttsProvider].model,
|
||||||
speedFactor: settings.ttsSpeedFactor
|
speedFactor: settings.ttsSpeedFactor,
|
||||||
|
instructions: settings.ttsInstructions
|
||||||
});
|
});
|
||||||
|
|
||||||
const audioDuration = ttsResult.duration;
|
const audioDuration = ttsResult.duration;
|
||||||
|
|||||||
Reference in New Issue
Block a user