diff --git a/index.js b/index.js
index 2f112b8..8e46cb1 100644
--- a/index.js
+++ b/index.js
@@ -28,6 +28,11 @@ const defaultConfig = {
       model: "gpt-4o",
       maxTokens: 300
     },
+    gemini: {
+      apiKey: process.env.GOOGLE_API_KEY,
+      model: "gemini-2.0-flash",
+      maxTokens: 300
+    }
     // Add other vision providers here
   },
@@ -76,6 +81,8 @@ class VisionProviderFactory {
     switch (providerName) {
       case 'openai':
         return new OpenAIVisionProvider(providerConfig);
+      case 'gemini':
+        return new GeminiVisionProvider(providerConfig);
       // Add other providers here
       default:
         throw new Error(`Vision provider "${providerName}" not implemented.`);
@@ -288,6 +295,203 @@ class OpenAIVisionProvider {
   }
 }
 
+/**
+ * Google Gemini Vision Provider Implementation
+ */
+class GeminiVisionProvider {
+  constructor(config) {
+    this.config = config;
+
+    // Import the Google Generative AI SDK
+    const { GoogleGenerativeAI } = require("@google/generative-ai");
+
+    // Initialize the API
+    this.genAI = new GoogleGenerativeAI(config.apiKey);
+    this.model = this.genAI.getGenerativeModel({ model: config.model });
+  }
+
+  /**
+   * Infer a MIME type from the file extension, defaulting to JPEG
+   * @param {string} imagePath - Path to the image file
+   * @returns {string} MIME type
+   */
+  getMimeType(imagePath) {
+    const ext = imagePath.slice(imagePath.lastIndexOf('.')).toLowerCase();
+    const types = {
+      '.png': 'image/png',
+      '.gif': 'image/gif',
+      '.webp': 'image/webp',
+      '.jpg': 'image/jpeg',
+      '.jpeg': 'image/jpeg'
+    };
+    return types[ext] || 'image/jpeg';
+  }
+
+  /**
+   * Describe a single image
+   * @param {string} imagePath - Path to the image file
+   * @param {string} prompt - Prompt for the AI
+   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
+   */
+  async describeImage(imagePath, prompt) {
+    try {
+      const imageData = fs.readFileSync(imagePath);
+      const mimeType = this.getMimeType(imagePath);
+
+      // Create a file part for the image
+      const imagePart = {
+        inlineData: {
+          data: imageData.toString('base64'),
+          mimeType
+        }
+      };
+
+      // Generate content using Gemini
+      const result = await this.model.generateContent([prompt, imagePart]);
+      const response = result.response;
+      const text = response.text();
+
+      // Prefer the token counts the SDK reports in response.usageMetadata;
+      // fall back to a rough estimate (~4 characters per token, plus a
+      // fixed allowance for the image) if it is unavailable
+      const inputTokens = response.usageMetadata?.promptTokenCount
+        ?? Math.ceil(prompt.length / 4) + 1000;
+      const outputTokens = response.usageMetadata?.candidatesTokenCount
+        ?? Math.ceil(text.length / 4);
+
+      return {
+        description: text,
+        usage: {
+          inputTokens,
+          outputTokens,
+          totalTokens: inputTokens + outputTokens
+        }
+      };
+    } catch (error) {
+      console.error("Error describing image with Gemini:", error);
+      return {
+        description: "Unable to describe this image.",
+        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
+      };
+    }
+  }
+
+  /**
+   * Compare two images and describe the differences
+   * @param {string} image1Path - Path to the first image
+   * @param {string} image2Path - Path to the second image
+   * @param {string} prompt - Prompt for the AI
+   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
+   */
+  async compareImages(image1Path, image2Path, prompt) {
+    try {
+      const image1Data = fs.readFileSync(image1Path);
+      const image2Data = fs.readFileSync(image2Path);
+
+      // Create file parts for both images
+      const image1Part = {
+        inlineData: {
+          data: image1Data.toString('base64'),
+          mimeType: this.getMimeType(image1Path)
+        }
+      };
+
+      const image2Part = {
+        inlineData: {
+          data: image2Data.toString('base64'),
+          mimeType: this.getMimeType(image2Path)
+        }
+      };
+
+      // Generate content using Gemini with both images
+      const result = await this.model.generateContent([prompt, image1Part, image2Part]);
+      const response = result.response;
+      const text = response.text();
+
+      // Prefer the SDK's reported token counts; fall back to a rough
+      // estimate that allows for two images
+      const inputTokens = response.usageMetadata?.promptTokenCount
+        ?? Math.ceil(prompt.length / 4) + 2000;
+      const outputTokens = response.usageMetadata?.candidatesTokenCount
+        ?? Math.ceil(text.length / 4);
+
+      return {
+        description: text,
+        usage: {
+          inputTokens,
+          outputTokens,
+          totalTokens: inputTokens + outputTokens
+        }
+      };
+    } catch (error) {
+      console.error("Error comparing images with Gemini:", error);
+      return {
+        description: "Unable to describe the differences between these images.",
+        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
+      };
+    }
+  }
+
+  /**
+   * Describe a batch of images
+   * @param {string[]} imagePaths - Array of paths to the images
+   * @param {object} lastBatchContext - Context from the previous batch
+   * @param {string} prompt - Prompt for the AI
+   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
+   */
+  async describeBatch(imagePaths, lastBatchContext, prompt) {
+    try {
+      // Include context from the previous batch, if available
+      let contextualPrompt = prompt;
+      if (lastBatchContext && lastBatchContext.lastDescription) {
+        contextualPrompt = `Previous batch summary: ${lastBatchContext.lastDescription}\n\n${prompt}`;
+      }
+
+      // Create the content parts array, starting with the prompt
+      const contentParts = [contextualPrompt];
+
+      // Add all images to the content parts
+      for (const imagePath of imagePaths) {
+        const imageData = fs.readFileSync(imagePath);
+
+        contentParts.push({
+          inlineData: {
+            data: imageData.toString('base64'),
+            mimeType: this.getMimeType(imagePath)
+          }
+        });
+      }
+
+      // Generate content using Gemini with all images
+      const result = await this.model.generateContent(contentParts);
+      const response = result.response;
+      const text = response.text();
+
+      // Prefer the SDK's reported token counts; fall back to a rough
+      // per-image estimate
+      const inputTokens = response.usageMetadata?.promptTokenCount
+        ?? Math.ceil(contextualPrompt.length / 4) + (1000 * imagePaths.length);
+      const outputTokens = response.usageMetadata?.candidatesTokenCount
+        ?? Math.ceil(text.length / 4);
+
+      return {
+        description: text,
+        usage: {
+          inputTokens,
+          outputTokens,
+          totalTokens: inputTokens + outputTokens
+        }
+      };
+    } catch (error) {
+      console.error("Error describing batch of images with Gemini:", error);
+      return {
+        description: "Unable to describe this batch of images.",
+        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
+      };
+    }
+  }
+}
+
 /**
  * OpenAI TTS Provider Implementation
  */
@@ -1081,6 +1285,12 @@ function printStats(stats, settings) {
         output: 0.01 // per 1K output tokens
       }
       // Add other OpenAI models here
+    },
+    gemini: {
+      'gemini-2.0-flash': {
+        input: 0.0001, // per 1K input tokens ($0.10 per 1M)
+        output: 0.0004 // per 1K output tokens ($0.40 per 1M)
+      }
     }
     // Add other vision providers here
   },
@@ -1169,6 +1379,12 @@ async function estimateCost(videoFilePath, options = {}) {
         output: 0.01 // per 1K output tokens
       }
       // Add other OpenAI models here
+    },
+    gemini: {
+      'gemini-2.0-flash': {
+        input: 0.0001, // per 1K input tokens ($0.10 per 1M)
+        output: 0.0004 // per 1K output tokens ($0.40 per 1M)
+      }
     }
     // Add other vision providers here
   },
diff --git a/package-lock.json b/package-lock.json
index a55171f..a9728fb 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -8,6 +8,7 @@
       "name": "video-audio-description-generator",
       "version": "1.0.0",
       "dependencies": {
+        "@google/generative-ai": "^0.24.0",
         "axios": "^1.6.2",
         "dotenv": "^16.3.1",
         "fluent-ffmpeg": "^2.1.2",
@@ -18,6 +19,14 @@
         "node": ">=14.0.0"
       }
     },
+    "node_modules/@google/generative-ai": {
+      "version": "0.24.0",
+      "resolved": "https://registry.npmjs.org/@google/generative-ai/-/generative-ai-0.24.0.tgz",
+      "integrity": "sha512-fnEITCGEB7NdX0BhoYZ/cq/7WPZ1QS5IzJJfC3Tg/OwkvBetMiVJciyaan297OvE4B9Jg1xvo0zIazX/9sGu1Q==",
"sha512-fnEITCGEB7NdX0BhoYZ/cq/7WPZ1QS5IzJJfC3Tg/OwkvBetMiVJciyaan297OvE4B9Jg1xvo0zIazX/9sGu1Q==", + "engines": { + "node": ">=18.0.0" + } + }, "node_modules/@types/node": { "version": "18.19.79", "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.79.tgz", diff --git a/package.json b/package.json index f579398..07d36f8 100644 --- a/package.json +++ b/package.json @@ -7,6 +7,7 @@ "start": "node index.js" }, "dependencies": { + "@google/generative-ai": "^0.24.0", "axios": "^1.6.2", "dotenv": "^16.3.1", "fluent-ffmpeg": "^2.1.2",