Add Google Gemini vision provider implementation

2025-03-13 00:32:52 +03:00
parent 78730c2ce9
commit 8e6ca2b0e2
3 changed files with 202 additions and 0 deletions
--- a/index.js
+++ b/index.js
@@ -28,6 +28,11 @@ const defaultConfig = {
      model: "gpt-4o",
      maxTokens: 300
    },
+    gemini: {
+      apiKey: process.env.GOOGLE_API_KEY,
+      model: "gemini-2.0-flash",
+      maxTokens: 300
+    }
    // Add other vision providers here
  },
  
@@ -76,6 +81,8 @@ class VisionProviderFactory {
    switch (providerName) {
      case 'openai':
        return new OpenAIVisionProvider(providerConfig);
+      case 'gemini':
+        return new GeminiVisionProvider(providerConfig);
      // Add other providers here
      default:
        throw new Error(`Vision provider "${providerName}" not implemented.`);
@@ -287,6 +294,179 @@ class OpenAIVisionProvider {
  }
 }

+/**
+ * Google Gemini Vision Provider Implementation
+ *
+ * Sends a text prompt plus one or more inline (base64) images to a Gemini
+ * model through the @google/generative-ai SDK. Every public method resolves to
+ * `{ description, usage }`; on any failure the error is logged and a fixed
+ * fallback description with zeroed usage is returned instead of throwing.
+ */
+class GeminiVisionProvider {
+  // File extension -> MIME type sent with the inline image data.
+  // Unknown extensions fall back to JPEG (see #mimeTypeFor).
+  static #MIME_BY_EXTENSION = new Map([
+    ['jpg', 'image/jpeg'],
+    ['jpeg', 'image/jpeg'],
+    ['png', 'image/png'],
+    ['webp', 'image/webp'],
+    ['heic', 'image/heic'],
+    ['heif', 'image/heif']
+  ]);
+
+  /**
+   * @param {object} config - Provider settings; `apiKey`, `model` and
+   *   `maxTokens` are read here.
+   */
+  constructor(config) {
+    this.config = config;
+
+    // Required inside the constructor, so the SDK is loaded only when a
+    // provider instance is actually created.
+    const { GoogleGenerativeAI } = require("@google/generative-ai");
+
+    this.genAI = new GoogleGenerativeAI(config.apiKey);
+
+    const modelParams = { model: config.model };
+    // Forward the configured output cap to the API; without this the
+    // maxTokens setting has no effect for this provider.
+    if (Number.isInteger(config.maxTokens) && config.maxTokens > 0) {
+      modelParams.generationConfig = { maxOutputTokens: config.maxTokens };
+    }
+    this.model = this.genAI.getGenerativeModel(modelParams);
+  }
+
+  /**
+   * Pick the MIME type for an image from its file extension.
+   * @param {string} imagePath - Path to the image file
+   * @returns {string} Matching MIME type, or 'image/jpeg' when the extension is unknown
+   */
+  static #mimeTypeFor(imagePath) {
+    const pathText = String(imagePath);
+    const dotIndex = pathText.lastIndexOf('.');
+    const extension = dotIndex === -1 ? '' : pathText.slice(dotIndex + 1).toLowerCase();
+    return GeminiVisionProvider.#MIME_BY_EXTENSION.get(extension) ?? 'image/jpeg';
+  }
+
+  /**
+   * Read an image from disk and wrap it as an inline-data content part.
+   * @param {string} imagePath - Path to the image file
+   * @returns {{inlineData: {data: string, mimeType: string}}} Content part with base64 data
+   */
+  #toImagePart(imagePath) {
+    return {
+      inlineData: {
+        data: fs.readFileSync(imagePath).toString('base64'),
+        mimeType: GeminiVisionProvider.#mimeTypeFor(imagePath)
+      }
+    };
+  }
+
+  /**
+   * Send the prompt followed by the given images and collect text and usage.
+   * @param {string} promptText - Prompt placed first in the request
+   * @param {string[]} imagePaths - Images appended after the prompt, in order
+   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
+   */
+  async #generate(promptText, imagePaths) {
+    const contentParts = [
+      promptText,
+      ...imagePaths.map((imagePath) => this.#toImagePart(imagePath))
+    ];
+
+    const result = await this.model.generateContent(contentParts);
+    const response = await result.response;
+    const text = response.text();
+
+    // Prefer the token counts reported by the API; fall back to a rough
+    // estimate (~4 characters per token, ~1000 tokens per image) when the
+    // response carries no usage metadata.
+    const reported = response.usageMetadata;
+    const inputTokens = reported?.promptTokenCount
+      ?? Math.ceil(promptText.length / 4) + (1000 * imagePaths.length);
+    const outputTokens = reported?.candidatesTokenCount
+      ?? Math.ceil(text.length / 4);
+
+    return {
+      description: text,
+      usage: {
+        inputTokens,
+        outputTokens,
+        totalTokens: inputTokens + outputTokens
+      }
+    };
+  }
+
+  /**
+   * Log a failure and build the best-effort result returned in its place.
+   * @param {string} logMessage - Message passed to console.error ahead of the error
+   * @param {string} description - Fallback description handed back to the caller
+   * @param {unknown} error - The caught error
+   * @returns {{description: string, usage: object}} Fallback result with zeroed usage
+   */
+  #failure(logMessage, description, error) {
+    console.error(logMessage, error);
+    return {
+      description,
+      usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
+    };
+  }
+
+  /**
+   * Describe a single image
+   * @param {string} imagePath - Path to the image file
+   * @param {string} prompt - Prompt for the AI
+   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
+   */
+  async describeImage(imagePath, prompt) {
+    try {
+      return await this.#generate(prompt, [imagePath]);
+    } catch (error) {
+      return this.#failure(
+        "Error describing image with Gemini:",
+        "Unable to describe this image.",
+        error
+      );
+    }
+  }
+
+  /**
+   * Compare two images and describe the differences
+   * @param {string} image1Path - Path to the first image
+   * @param {string} image2Path - Path to the second image
+   * @param {string} prompt - Prompt for the AI
+   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
+   */
+  async compareImages(image1Path, image2Path, prompt) {
+    try {
+      return await this.#generate(prompt, [image1Path, image2Path]);
+    } catch (error) {
+      return this.#failure(
+        "Error comparing images with Gemini:",
+        "Unable to describe the differences between these images.",
+        error
+      );
+    }
+  }
+
+  /**
+   * Describe a batch of images
+   * @param {string[]} imagePaths - Array of paths to the images
+   * @param {object} lastBatchContext - Context from the previous batch; its
+   *   `lastDescription`, when truthy, is prepended to the prompt
+   * @param {string} prompt - Prompt for the AI
+   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
+   */
+  async describeBatch(imagePaths, lastBatchContext, prompt) {
+    try {
+      // Carry the previous batch's summary forward so the model has continuity
+      const contextualPrompt = lastBatchContext && lastBatchContext.lastDescription
+        ? `Previous batch summary: ${lastBatchContext.lastDescription}\n\n${prompt}`
+        : prompt;
+
+      return await this.#generate(contextualPrompt, imagePaths);
+    } catch (error) {
+      return this.#failure(
+        "Error describing batch of images with Gemini:",
+        "Unable to describe this batch of images.",
+        error
+      );
+    }
+  }
+}
+
 /**
 * OpenAI TTS Provider Implementation
 */
@@ -1080,6 +1260,12 @@ function printStats(stats, settings) {
          output: 0.01    // per 1K output tokens
        }
        // Add other OpenAI models here
+      },
+      gemini: {
+        'gemini-pro-vision': {
+          input: 0.0025,  // per 1K input tokens
+          output: 0.0025   // per 1K output tokens
+        }
      }
      // Add other vision providers here
    },
@@ -1168,6 +1354,12 @@ async function estimateCost(videoFilePath, options = {}) {
          output: 0.01    // per 1K output tokens
        }
        // Add other OpenAI models here
+      },
+      gemini: {
+        'gemini-pro-vision': {
+          input: 0.0025,  // per 1K input tokens
+          output: 0.0025   // per 1K output tokens
+        }
      }
      // Add other vision providers here
    },