import fs from 'fs';
import path from 'path';
import { GoogleGenerativeAI, GenerativeModel } from '@google/generative-ai';
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';

/** Inline (base64) image payload in the shape `generateContent` accepts. */
interface InlineImagePart {
  inlineData: { data: string; mimeType: string };
}

/** MIME types keyed by lowercase file extension. */
const MIME_TYPES_BY_EXTENSION: Readonly<Record<string, string>> = {
  '.jpg': 'image/jpeg',
  '.jpeg': 'image/jpeg',
  '.png': 'image/png',
  '.webp': 'image/webp',
  '.heic': 'image/heic',
  '.heif': 'image/heif',
};

/** Used when the extension is missing or unknown (the historical behaviour). */
const DEFAULT_MIME_TYPE = 'image/jpeg';

/** Rough per-image input cost used only when the API reports no usage. */
const ESTIMATED_TOKENS_PER_IMAGE = 1000;

/** Rough characters-per-token ratio used only when the API reports no usage. */
const ESTIMATED_CHARS_PER_TOKEN = 4;

/** Pick a MIME type from the file extension, falling back to JPEG. */
function detectMimeType(imagePath: string): string {
  const extension = path.extname(imagePath).toLowerCase();
  return MIME_TYPES_BY_EXTENSION[extension] ?? DEFAULT_MIME_TYPE;
}

/** Read an image from disk without blocking the event loop and base64-encode it. */
async function loadImagePart(imagePath: string): Promise<InlineImagePart> {
  const imageData = await fs.promises.readFile(imagePath);
  return {
    inlineData: {
      data: imageData.toString('base64'),
      mimeType: detectMimeType(imagePath),
    },
  };
}

/** Build the zero-usage result returned when a request fails. */
function failureResult(description: string): VisionResult {
  return {
    description,
    usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 },
  };
}

/**
 * Google Gemini Vision Provider Implementation
 *
 * Every public method is best-effort: failures (unreadable file, API error)
 * are logged and turned into a fallback description with zero usage instead
 * of being thrown.
 */
export class GeminiVisionProvider implements VisionProvider {
  private readonly config: VisionProviderConfig;
  private readonly genAI: GoogleGenerativeAI;
  private readonly model: GenerativeModel;

  /**
   * @param config - Provider configuration; `apiKey` and `model` are read here.
   */
  constructor(config: VisionProviderConfig) {
    this.config = config;
    // The constructor has never validated the key; a missing key surfaces as a
    // failed request (and therefore a fallback result), not as a throw here.
    this.genAI = new GoogleGenerativeAI(config.apiKey!);
    this.model = this.genAI.getGenerativeModel({ model: config.model });
  }

  /**
   * Describe a single image
   * @param imagePath - Path to the image file
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
    try {
      return await this.generateDescription(prompt, [imagePath]);
    } catch (error: unknown) {
      console.error("Error describing image with Gemini:", error);
      return failureResult("Unable to describe this image.");
    }
  }

  /**
   * Compare two images and describe the differences
   * @param image1Path - Path to the first image
   * @param image2Path - Path to the second image
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
    try {
      return await this.generateDescription(prompt, [image1Path, image2Path]);
    } catch (error: unknown) {
      console.error("Error comparing images with Gemini:", error);
      return failureResult("Unable to describe the differences between these images.");
    }
  }

  /**
   * Describe a batch of images
   * @param imagePaths - Array of paths to the images
   * @param lastBatchContext - Context from the previous batch
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async describeBatch(
    imagePaths: string[],
    lastBatchContext: BatchContext,
    prompt: string
  ): Promise<VisionResult> {
    try {
      // Prepend the previous batch's summary when there is one. The optional
      // chain keeps the original tolerance for a missing context at runtime.
      const previousSummary = lastBatchContext?.lastDescription;
      const contextualPrompt = previousSummary
        ? `Previous batch summary: ${previousSummary}\n\n${prompt}`
        : prompt;

      return await this.generateDescription(contextualPrompt, imagePaths);
    } catch (error: unknown) {
      console.error("Error describing batch of images with Gemini:", error);
      return failureResult("Unable to describe this batch of images.");
    }
  }

  /**
   * Send the prompt followed by the given images (in order) to the model.
   *
   * Rejects if any image cannot be read or the request fails; the public
   * methods catch that and substitute their own fallback result.
   *
   * @param prompt - Full prompt text placed before the images
   * @param imagePaths - Paths of the images to attach
   * @returns Model text plus token usage
   */
  private async generateDescription(
    prompt: string,
    imagePaths: readonly string[]
  ): Promise<VisionResult> {
    // Reads are independent, so run them concurrently; Promise.all keeps order.
    const imageParts = await Promise.all(imagePaths.map((imagePath) => loadImagePart(imagePath)));

    const result = await this.model.generateContent([prompt, ...imageParts]);
    const response = await result.response;
    const text = response.text();

    // Prefer the counts reported by the API; fall back to the rough
    // length-based estimate when they are not present on the response.
    const reported = response.usageMetadata;
    const inputTokens =
      reported?.promptTokenCount ??
      Math.ceil(prompt.length / ESTIMATED_CHARS_PER_TOKEN) +
        ESTIMATED_TOKENS_PER_IMAGE * imagePaths.length;
    const outputTokens =
      reported?.candidatesTokenCount ?? Math.ceil(text.length / ESTIMATED_CHARS_PER_TOKEN);

    return {
      description: text,
      usage: {
        inputTokens,
        outputTokens,
        totalTokens: inputTokens + outputTokens,
      },
    };
  }
}