Add Google Gemini vision provider implementation

other-providers
the-byte-bender 2025-03-13 00:32:52 +03:00
parent 78730c2ce9
commit 8e6ca2b0e2
3 changed files with 202 additions and 0 deletions

index.js (+192)

@@ -28,6 +28,11 @@ const defaultConfig = {
       model: "gpt-4o",
       maxTokens: 300
     },
+    gemini: {
+      apiKey: process.env.GOOGLE_API_KEY,
+      model: "gemini-2.0-flash",
+      maxTokens: 300
+    }
     // Add other vision providers here
   },
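The new gemini block mirrors the existing openai entry and reads its key from the GOOGLE_API_KEY environment variable, loaded via the project's existing dotenv dependency. As a rough sketch of how a caller might select it (the override shape is an assumption based on defaultConfig's layout; it is not shown in this diff):

    // .env — variable name taken from the config above
    // GOOGLE_API_KEY=your-gemini-api-key

    // Hypothetical per-call override following defaultConfig's nesting
    const options = {
      vision: {
        provider: 'gemini',
        gemini: { model: 'gemini-2.0-flash', maxTokens: 300 }
      }
    };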
@@ -76,6 +81,8 @@ class VisionProviderFactory {
     switch (providerName) {
       case 'openai':
         return new OpenAIVisionProvider(providerConfig);
+      case 'gemini':
+        return new GeminiVisionProvider(providerConfig);
       // Add other providers here
       default:
         throw new Error(`Vision provider "${providerName}" not implemented.`);
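With the new case, asking the factory for 'gemini' now returns a GeminiVisionProvider instead of falling through to the error. A minimal usage sketch, assuming the factory wraps this switch in a static creation method (createProvider is a hypothetical name; only the switch body appears in this diff):

    // Hypothetical call site — method name and image path are assumptions
    const provider = VisionProviderFactory.createProvider('gemini', defaultConfig.vision.gemini);
    const { description, usage } = await provider.describeImage('frame_001.jpg', 'Describe this video frame.');
    console.log(description, usage.totalTokens);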
@@ -287,6 +294,179 @@ class OpenAIVisionProvider {
     }
 }
+
+/**
+ * Google Gemini Vision Provider Implementation
+ */
+class GeminiVisionProvider {
+  constructor(config) {
+    this.config = config;
+
+    // Import the Google Generative AI SDK
+    const { GoogleGenerativeAI } = require("@google/generative-ai");
+
+    // Initialize the API
+    this.genAI = new GoogleGenerativeAI(config.apiKey);
+    this.model = this.genAI.getGenerativeModel({ model: config.model });
+  }
+
+  /**
+   * Describe a single image
+   * @param {string} imagePath - Path to the image file
+   * @param {string} prompt - Prompt for the AI
+   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
+   */
+  async describeImage(imagePath, prompt) {
+    try {
+      const imageData = fs.readFileSync(imagePath);
+      const mimeType = 'image/jpeg'; // Assuming JPEG, could be detected based on file extension
+
+      // Create a file part for the image
+      const imagePart = {
+        inlineData: {
+          data: imageData.toString('base64'),
+          mimeType
+        }
+      };
+
+      // Generate content using Gemini
+      const result = await this.model.generateContent([prompt, imagePart]);
+      const response = await result.response;
+      const text = response.text();
+
+      // Gemini doesn't provide token usage information in the same way as OpenAI.
+      // We'll estimate based on prompt length and response length.
+      const inputTokens = Math.ceil(prompt.length / 4) + 1000; // rough estimate for image
+      const outputTokens = Math.ceil(text.length / 4);
+
+      return {
+        description: text,
+        usage: {
+          inputTokens,
+          outputTokens,
+          totalTokens: inputTokens + outputTokens
+        }
+      };
+    } catch (error) {
+      console.error("Error describing image with Gemini:", error);
+      return {
+        description: "Unable to describe this image.",
+        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
+      };
+    }
+  }
+
+  /**
+   * Compare two images and describe the differences
+   * @param {string} image1Path - Path to the first image
+   * @param {string} image2Path - Path to the second image
+   * @param {string} prompt - Prompt for the AI
+   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
+   */
+  async compareImages(image1Path, image2Path, prompt) {
+    try {
+      const image1Data = fs.readFileSync(image1Path);
+      const image2Data = fs.readFileSync(image2Path);
+      const mimeType = 'image/jpeg'; // Assuming JPEG, could be detected based on file extension
+
+      // Create file parts for both images
+      const image1Part = {
+        inlineData: {
+          data: image1Data.toString('base64'),
+          mimeType
+        }
+      };
+      const image2Part = {
+        inlineData: {
+          data: image2Data.toString('base64'),
+          mimeType
+        }
+      };
+
+      // Generate content using Gemini with both images
+      const result = await this.model.generateContent([prompt, image1Part, image2Part]);
+      const response = await result.response;
+      const text = response.text();
+
+      // Estimate token usage
+      const inputTokens = Math.ceil(prompt.length / 4) + 2000; // rough estimate for two images
+      const outputTokens = Math.ceil(text.length / 4);
+
+      return {
+        description: text,
+        usage: {
+          inputTokens,
+          outputTokens,
+          totalTokens: inputTokens + outputTokens
+        }
+      };
+    } catch (error) {
+      console.error("Error comparing images with Gemini:", error);
+      return {
+        description: "Unable to describe the differences between these images.",
+        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
+      };
+    }
+  }
+
+  /**
+   * Describe a batch of images
+   * @param {string[]} imagePaths - Array of paths to the images
+   * @param {object} lastBatchContext - Context from the previous batch
+   * @param {string} prompt - Prompt for the AI
+   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
+   */
+  async describeBatch(imagePaths, lastBatchContext, prompt) {
+    try {
+      // Create a prompt that includes context from the last batch if available
+      let contextualPrompt = prompt;
+      if (lastBatchContext && lastBatchContext.lastDescription) {
+        contextualPrompt = `Previous batch summary: ${lastBatchContext.lastDescription}\n\n${prompt}`;
+      }
+
+      // Create content parts array starting with the prompt
+      const contentParts = [contextualPrompt];
+
+      // Add all images to the content parts
+      for (const imagePath of imagePaths) {
+        const imageData = fs.readFileSync(imagePath);
+        const mimeType = 'image/jpeg'; // Assuming JPEG, could be detected based on file extension
+        contentParts.push({
+          inlineData: {
+            data: imageData.toString('base64'),
+            mimeType
+          }
+        });
+      }
+
+      // Generate content using Gemini with all images
+      const result = await this.model.generateContent(contentParts);
+      const response = await result.response;
+      const text = response.text();
+
+      // Estimate token usage
+      const inputTokens = Math.ceil(contextualPrompt.length / 4) + (1000 * imagePaths.length); // rough estimate
+      const outputTokens = Math.ceil(text.length / 4);
+
+      return {
+        description: text,
+        usage: {
+          inputTokens,
+          outputTokens,
+          totalTokens: inputTokens + outputTokens
+        }
+      };
+    } catch (error) {
+      console.error("Error describing batch of images with Gemini:", error);
+      return {
+        description: "Unable to describe this batch of images.",
+        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
+      };
+    }
+  }
+}
+
 /**
  * OpenAI TTS Provider Implementation
  */
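All three methods hard-code image/jpeg, as their inline comments concede. If the extension-based detection those comments gesture at were added, a minimal lookup could look like this (editorial sketch; this helper is not part of the commit):

    const path = require('path');

    // Map common file extensions to mime types, falling back to JPEG
    // so behavior matches the current hard-coded value.
    function detectMimeType(imagePath) {
      const extToMime = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.webp': 'image/webp',
        '.gif': 'image/gif'
      };
      return extToMime[path.extname(imagePath).toLowerCase()] || 'image/jpeg';
    }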
@@ -1080,6 +1260,12 @@ function printStats(stats, settings) {
         output: 0.01 // per 1K output tokens
       }
       // Add other OpenAI models here
+    },
+    gemini: {
+      'gemini-pro-vision': {
+        input: 0.0025, // per 1K input tokens
+        output: 0.0025 // per 1K output tokens
+      }
     }
     // Add other vision providers here
   },
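These per-1K rates pair with the usage counts the provider estimates; note the table keys on gemini-pro-vision while the default config above selects gemini-2.0-flash, so a rate lookup for the default model would not find an entry here. At the rates above, a call scored at 1,200 input and 100 output tokens works out to (1200/1000) × 0.0025 + (100/1000) × 0.0025 ≈ $0.00325. A sketch of that arithmetic (helper name hypothetical; the printStats internals are not shown in this diff):

    // Cost of one call given a usage object and an { input, output } rate entry
    function estimateCallCost(usage, rates) {
      return (usage.inputTokens / 1000) * rates.input +
             (usage.outputTokens / 1000) * rates.output;
    }

    // estimateCallCost({ inputTokens: 1200, outputTokens: 100 },
    //                  { input: 0.0025, output: 0.0025 })  // ≈ 0.00325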
@@ -1168,6 +1354,12 @@ async function estimateCost(videoFilePath, options = {}) {
         output: 0.01 // per 1K output tokens
       }
       // Add other OpenAI models here
+    },
+    gemini: {
+      'gemini-pro-vision': {
+        input: 0.0025, // per 1K input tokens
+        output: 0.0025 // per 1K output tokens
+      }
     }
     // Add other vision providers here
   },

package-lock.json (+9, generated)

@@ -8,6 +8,7 @@
       "name": "video-audio-description-generator",
       "version": "1.0.0",
       "dependencies": {
+        "@google/generative-ai": "^0.24.0",
         "axios": "^1.6.2",
         "dotenv": "^16.3.1",
         "fluent-ffmpeg": "^2.1.2",
@@ -18,6 +19,14 @@
         "node": ">=14.0.0"
       }
     },
+    "node_modules/@google/generative-ai": {
+      "version": "0.24.0",
+      "resolved": "https://registry.npmjs.org/@google/generative-ai/-/generative-ai-0.24.0.tgz",
+      "integrity": "sha512-fnEITCGEB7NdX0BhoYZ/cq/7WPZ1QS5IzJJfC3Tg/OwkvBetMiVJciyaan297OvE4B9Jg1xvo0zIazX/9sGu1Q==",
+      "engines": {
+        "node": ">=18.0.0"
+      }
+    },
     "node_modules/@types/node": {
       "version": "18.19.79",
       "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.79.tgz",

package.json (+1)

@@ -7,6 +7,7 @@
     "start": "node index.js"
   },
   "dependencies": {
+    "@google/generative-ai": "^0.24.0",
     "axios": "^1.6.2",
     "dotenv": "^16.3.1",
     "fluent-ffmpeg": "^2.1.2",