Compare commits
	
		
			1 Commits
		
	
	
		
			5312410d7e
			...
			other-prov
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 8e6ca2b0e2 | 
							
								
								
									
										193
									
								
								index.js
									
									
									
									
									
								
							
							
						
						
									
										193
									
								
								index.js
									
									
									
									
									
								
							@@ -28,6 +28,11 @@ const defaultConfig = {
 | 
			
		||||
      model: "gpt-4o",
 | 
			
		||||
      maxTokens: 300
 | 
			
		||||
    },
 | 
			
		||||
    gemini: {
 | 
			
		||||
      apiKey: process.env.GOOGLE_API_KEY,
 | 
			
		||||
      model: "gemini-2.0-flash",
 | 
			
		||||
      maxTokens: 300
 | 
			
		||||
    }
 | 
			
		||||
    // Add other vision providers here
 | 
			
		||||
  },
 | 
			
		||||
  
 | 
			
		||||
@@ -76,6 +81,8 @@ class VisionProviderFactory {
 | 
			
		||||
    switch (providerName) {
 | 
			
		||||
      case 'openai':
 | 
			
		||||
        return new OpenAIVisionProvider(providerConfig);
 | 
			
		||||
      case 'gemini':
 | 
			
		||||
        return new GeminiVisionProvider(providerConfig);
 | 
			
		||||
      // Add other providers here
 | 
			
		||||
      default:
 | 
			
		||||
        throw new Error(`Vision provider "${providerName}" not implemented.`);
 | 
			
		||||
@@ -129,7 +136,6 @@ class OpenAIVisionProvider {
 | 
			
		||||
 | 
			
		||||
      const response = await this.openai.chat.completions.create({
 | 
			
		||||
        model: this.config.model,
 | 
			
		||||
        temperature: 0.1,
 | 
			
		||||
        messages: [
 | 
			
		||||
          {
 | 
			
		||||
            role: "user",
 | 
			
		||||
@@ -288,6 +294,179 @@ class OpenAIVisionProvider {
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Google Gemini Vision Provider Implementation
 | 
			
		||||
 */
 | 
			
		||||
class GeminiVisionProvider {
  // Maps lowercase file extensions to Gemini-supported image MIME types.
  // Anything unrecognized falls back to JPEG (the previous hard-coded behavior).
  static #MIME_TYPES = {
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.webp': 'image/webp',
    '.heic': 'image/heic',
    '.heif': 'image/heif'
  };

  /**
   * @param {object} config - Provider config; must include `apiKey` and `model`.
   */
  constructor(config) {
    this.config = config;

    // Import the Google Generative AI SDK lazily so the dependency is only
    // loaded when this provider is actually selected.
    const { GoogleGenerativeAI } = require("@google/generative-ai");

    // Initialize the API client and the configured vision model.
    this.genAI = new GoogleGenerativeAI(config.apiKey);
    this.model = this.genAI.getGenerativeModel({ model: config.model });
  }

  /**
   * Detect the MIME type from the file extension.
   * @param {string} filePath - Path whose extension determines the type
   * @returns {string} A MIME type; defaults to image/jpeg for unknown extensions
   */
  static #mimeTypeFor(filePath) {
    const match = /\.[^./\\]+$/.exec(filePath.toLowerCase());
    return (match && GeminiVisionProvider.#MIME_TYPES[match[0]]) || 'image/jpeg';
  }

  /**
   * Read an image from disk and wrap it as a Gemini inlineData content part.
   * @param {string} imagePath - Path to the image file
   * @returns {{inlineData: {data: string, mimeType: string}}}
   */
  static #imagePart(imagePath) {
    return {
      inlineData: {
        data: fs.readFileSync(imagePath).toString('base64'),
        mimeType: GeminiVisionProvider.#mimeTypeFor(imagePath)
      }
    };
  }

  /**
   * Build a usage object, preferring the SDK's real token counts.
   * Recent @google/generative-ai versions expose `response.usageMetadata`
   * (promptTokenCount / candidatesTokenCount / totalTokenCount); when it is
   * absent we keep the previous rough estimate (~4 chars per token, ~1000
   * tokens per image).
   * @param {object} response - Gemini response object
   * @param {string} promptText - Full prompt text sent to the model
   * @param {string} responseText - Text returned by the model
   * @param {number} imageCount - Number of images included in the request
   * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}}
   */
  static #usage(response, promptText, responseText, imageCount) {
    const meta = response && response.usageMetadata;
    if (meta && typeof meta.promptTokenCount === 'number') {
      const inputTokens = meta.promptTokenCount;
      const outputTokens = meta.candidatesTokenCount ?? 0;
      return {
        inputTokens,
        outputTokens,
        totalTokens: meta.totalTokenCount ?? inputTokens + outputTokens
      };
    }
    // Fallback estimate when the SDK gives no usage metadata.
    const inputTokens = Math.ceil(promptText.length / 4) + 1000 * imageCount;
    const outputTokens = Math.ceil(responseText.length / 4);
    return { inputTokens, outputTokens, totalTokens: inputTokens + outputTokens };
  }

  /**
   * Describe a single image
   * @param {string} imagePath - Path to the image file
   * @param {string} prompt - Prompt for the AI
   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
   */
  async describeImage(imagePath, prompt) {
    try {
      const result = await this.model.generateContent([
        prompt,
        GeminiVisionProvider.#imagePart(imagePath)
      ]);
      const response = await result.response;
      const text = response.text();

      return {
        description: text,
        usage: GeminiVisionProvider.#usage(response, prompt, text, 1)
      };
    } catch (error) {
      console.error("Error describing image with Gemini:", error);
      return {
        description: "Unable to describe this image.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Compare two images and describe the differences
   * @param {string} image1Path - Path to the first image
   * @param {string} image2Path - Path to the second image
   * @param {string} prompt - Prompt for the AI
   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
   */
  async compareImages(image1Path, image2Path, prompt) {
    try {
      const result = await this.model.generateContent([
        prompt,
        GeminiVisionProvider.#imagePart(image1Path),
        GeminiVisionProvider.#imagePart(image2Path)
      ]);
      const response = await result.response;
      const text = response.text();

      return {
        description: text,
        usage: GeminiVisionProvider.#usage(response, prompt, text, 2)
      };
    } catch (error) {
      console.error("Error comparing images with Gemini:", error);
      return {
        description: "Unable to describe the differences between these images.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Describe a batch of images
   * @param {string[]} imagePaths - Array of paths to the images
   * @param {object} lastBatchContext - Context from the previous batch
   * @param {string} prompt - Prompt for the AI
   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
   */
  async describeBatch(imagePaths, lastBatchContext, prompt) {
    try {
      // Prepend context from the previous batch, if any, to keep narration continuous.
      let contextualPrompt = prompt;
      if (lastBatchContext && lastBatchContext.lastDescription) {
        contextualPrompt = `Previous batch summary: ${lastBatchContext.lastDescription}\n\n${prompt}`;
      }

      // Content parts: the prompt first, then every image in order.
      const contentParts = [
        contextualPrompt,
        ...imagePaths.map((p) => GeminiVisionProvider.#imagePart(p))
      ];

      const result = await this.model.generateContent(contentParts);
      const response = await result.response;
      const text = response.text();

      return {
        description: text,
        usage: GeminiVisionProvider.#usage(response, contextualPrompt, text, imagePaths.length)
      };
    } catch (error) {
      console.error("Error describing batch of images with Gemini:", error);
      return {
        description: "Unable to describe this batch of images.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * OpenAI TTS Provider Implementation
 | 
			
		||||
 */
 | 
			
		||||
@@ -1081,6 +1260,12 @@ function printStats(stats, settings) {
 | 
			
		||||
          output: 0.01    // per 1K output tokens
 | 
			
		||||
        }
 | 
			
		||||
        // Add other OpenAI models here
 | 
			
		||||
      },
 | 
			
		||||
      gemini: {
 | 
			
		||||
        'gemini-pro-vision': {
 | 
			
		||||
          input: 0.0025,  // per 1K input tokens
 | 
			
		||||
          output: 0.0025   // per 1K output tokens
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      // Add other vision providers here
 | 
			
		||||
    },
 | 
			
		||||
@@ -1169,6 +1354,12 @@ async function estimateCost(videoFilePath, options = {}) {
 | 
			
		||||
          output: 0.01    // per 1K output tokens
 | 
			
		||||
        }
 | 
			
		||||
        // Add other OpenAI models here
 | 
			
		||||
      },
 | 
			
		||||
      gemini: {
 | 
			
		||||
        'gemini-pro-vision': {
 | 
			
		||||
          input: 0.0025,  // per 1K input tokens
 | 
			
		||||
          output: 0.0025   // per 1K output tokens
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      // Add other vision providers here
 | 
			
		||||
    },
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										9
									
								
								package-lock.json
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										9
									
								
								package-lock.json
									
									
									
										generated
									
									
									
								
							@@ -8,6 +8,7 @@
 | 
			
		||||
            "name": "video-audio-description-generator",
 | 
			
		||||
            "version": "1.0.0",
 | 
			
		||||
            "dependencies": {
 | 
			
		||||
                "@google/generative-ai": "^0.24.0",
 | 
			
		||||
                "axios": "^1.6.2",
 | 
			
		||||
                "dotenv": "^16.3.1",
 | 
			
		||||
                "fluent-ffmpeg": "^2.1.2",
 | 
			
		||||
@@ -18,6 +19,14 @@
 | 
			
		||||
                "node": ">=14.0.0"
 | 
			
		||||
            }
 | 
			
		||||
        },
 | 
			
		||||
        "node_modules/@google/generative-ai": {
 | 
			
		||||
            "version": "0.24.0",
 | 
			
		||||
            "resolved": "https://registry.npmjs.org/@google/generative-ai/-/generative-ai-0.24.0.tgz",
 | 
			
		||||
            "integrity": "sha512-fnEITCGEB7NdX0BhoYZ/cq/7WPZ1QS5IzJJfC3Tg/OwkvBetMiVJciyaan297OvE4B9Jg1xvo0zIazX/9sGu1Q==",
 | 
			
		||||
            "engines": {
 | 
			
		||||
                "node": ">=18.0.0"
 | 
			
		||||
            }
 | 
			
		||||
        },
 | 
			
		||||
        "node_modules/@types/node": {
 | 
			
		||||
            "version": "18.19.79",
 | 
			
		||||
            "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.79.tgz",
 | 
			
		||||
 
 | 
			
		||||
@@ -7,6 +7,7 @@
 | 
			
		||||
        "start": "node index.js"
 | 
			
		||||
    },
 | 
			
		||||
    "dependencies": {
 | 
			
		||||
        "@google/generative-ai": "^0.24.0",
 | 
			
		||||
        "axios": "^1.6.2",
 | 
			
		||||
        "dotenv": "^16.3.1",
 | 
			
		||||
        "fluent-ffmpeg": "^2.1.2",
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user