Merge branch 'other-providers'

Add Google Gemini vision provider implementation
2025-03-12 22:53:01 +01:00 · 2025-03-13 00:32:52 +03:00
3 changed files with 202 additions and 0 deletions
--- a/index.js
+++ b/index.js
@@ -28,6 +28,11 @@ const defaultConfig = {
      model: "gpt-4o",
      maxTokens: 300
    },
    gemini: {
      apiKey: process.env.GOOGLE_API_KEY,
      model: "gemini-2.0-flash",
      maxTokens: 300
    }
    // Add other vision providers here
  },
@@ -76,6 +81,8 @@ class VisionProviderFactory {
    switch (providerName) {
      case 'openai':
        return new OpenAIVisionProvider(providerConfig);
      case 'gemini':
        return new GeminiVisionProvider(providerConfig);
      // Add other providers here
      default:
        throw new Error(`Vision provider "${providerName}" not implemented.`);
@@ -288,6 +295,179 @@ class OpenAIVisionProvider {
  }
 }
 /**
 * Google Gemini Vision Provider Implementation
 *
 * Sends a text prompt plus one or more inline (base64) images to a Gemini
 * model and returns the generated text together with token usage figures.
 * Every public method is best-effort: on any failure it logs the error and
 * resolves with a fallback description and zeroed usage instead of rejecting.
 */
 class GeminiVisionProvider {
  /**
   * @param {object} config - Provider settings
   * @param {string} config.apiKey - Google API key
   * @param {string} config.model - Gemini model name
   * @param {number} [config.maxTokens] - Output token limit, forwarded to the
   *   model as `generationConfig.maxOutputTokens` when set
   */
  constructor(config) {
    this.config = config;
    // Import the Google Generative AI SDK
    const { GoogleGenerativeAI } = require("@google/generative-ai");
    // Initialize the API
    this.genAI = new GoogleGenerativeAI(config.apiKey);
    const modelParams = { model: config.model };
    // Without this the configured maxTokens would never reach the API.
    if (config.maxTokens != null) {
      modelParams.generationConfig = { maxOutputTokens: config.maxTokens };
    }
    this.model = this.genAI.getGenerativeModel(modelParams);
  }
  /**
   * Pick the MIME type to declare for an image based on its file extension.
   * Unknown or missing extensions fall back to JPEG, the previous fixed value.
   * @param {string} imagePath - Path to the image file
   * @returns {string} MIME type string
   */
  static mimeTypeFor(imagePath) {
    const fileName = String(imagePath);
    const dotIndex = fileName.lastIndexOf(".");
    const extension = dotIndex === -1 ? "" : fileName.slice(dotIndex + 1).toLowerCase();
    switch (extension) {
      case "png":
        return "image/png";
      case "webp":
        return "image/webp";
      case "heic":
        return "image/heic";
      case "heif":
        return "image/heif";
      default:
        return "image/jpeg";
    }
  }
  /**
   * Read an image from disk and wrap it as an inline-data content part.
   * @param {string} imagePath - Path to the image file
   * @returns {{inlineData: {data: string, mimeType: string}}} Content part
   */
  buildImagePart(imagePath) {
    return {
      inlineData: {
        data: fs.readFileSync(imagePath).toString('base64'),
        mimeType: GeminiVisionProvider.mimeTypeFor(imagePath)
      }
    };
  }
  /**
   * Build the usage object for a response. Uses the counts in
   * `response.usageMetadata` when present; otherwise falls back to the rough
   * estimate of ~4 characters per token plus 1000 tokens per image.
   * @param {object} response - Response object returned by the model
   * @param {string} prompt - Prompt text that was sent
   * @param {string} text - Generated text
   * @param {number} imageCount - Number of images that were sent
   * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}}
   */
  resolveUsage(response, prompt, text, imageCount) {
    const reported = response && response.usageMetadata;
    if (reported && Number.isFinite(reported.promptTokenCount)) {
      const inputTokens = reported.promptTokenCount;
      const outputTokens = Number.isFinite(reported.candidatesTokenCount)
        ? reported.candidatesTokenCount
        : 0;
      const totalTokens = Number.isFinite(reported.totalTokenCount)
        ? reported.totalTokenCount
        : inputTokens + outputTokens;
      return { inputTokens, outputTokens, totalTokens };
    }
    const inputTokens = Math.ceil(prompt.length / 4) + (1000 * imageCount); // rough estimate
    const outputTokens = Math.ceil(text.length / 4);
    return { inputTokens, outputTokens, totalTokens: inputTokens + outputTokens };
  }
  /**
   * Send a prompt followed by the given images (in order) to the model.
   * Errors propagate to the caller, which supplies its own fallback.
   * @param {string} prompt - Prompt for the AI
   * @param {string[]} imagePaths - Paths of the images to attach
   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
   */
  async generate(prompt, imagePaths) {
    const imageParts = Array.from(imagePaths, (imagePath) => this.buildImagePart(imagePath));
    const result = await this.model.generateContent([prompt, ...imageParts]);
    const response = await result.response;
    const text = response.text();
    return {
      description: text,
      usage: this.resolveUsage(response, prompt, text, imageParts.length)
    };
  }
  /**
   * Describe a single image
   * @param {string} imagePath - Path to the image file
   * @param {string} prompt - Prompt for the AI
   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
   */
  async describeImage(imagePath, prompt) {
    try {
      return await this.generate(prompt, [imagePath]);
    } catch (error) {
      console.error("Error describing image with Gemini:", error);
      return {
        description: "Unable to describe this image.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }
  /**
   * Compare two images and describe the differences
   * @param {string} image1Path - Path to the first image
   * @param {string} image2Path - Path to the second image
   * @param {string} prompt - Prompt for the AI
   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
   */
  async compareImages(image1Path, image2Path, prompt) {
    try {
      return await this.generate(prompt, [image1Path, image2Path]);
    } catch (error) {
      console.error("Error comparing images with Gemini:", error);
      return {
        description: "Unable to describe the differences between these images.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }
  /**
   * Describe a batch of images
   * @param {string[]} imagePaths - Array of paths to the images
   * @param {object} lastBatchContext - Context from the previous batch
   * @param {string} prompt - Prompt for the AI
   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
   */
  async describeBatch(imagePaths, lastBatchContext, prompt) {
    try {
      // Prepend the previous batch's description, if any, so the model has continuity
      let contextualPrompt = prompt;
      if (lastBatchContext && lastBatchContext.lastDescription) {
        contextualPrompt = `Previous batch summary: ${lastBatchContext.lastDescription}\n\n${prompt}`;
      }
      return await this.generate(contextualPrompt, imagePaths);
    } catch (error) {
      console.error("Error describing batch of images with Gemini:", error);
      return {
        description: "Unable to describe this batch of images.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }
 }
 /**
 * OpenAI TTS Provider Implementation
 */
@@ -1081,6 +1261,12 @@ function printStats(stats, settings) {
          output: 0.01    // per 1K output tokens
        }
        // Add other OpenAI models here
      },
      gemini: {
        'gemini-pro-vision': {
          input: 0.0025,  // per 1K input tokens
          output: 0.0025   // per 1K output tokens
        }
      }
      // Add other vision providers here
    },
@@ -1169,6 +1355,12 @@ async function estimateCost(videoFilePath, options = {}) {
          output: 0.01    // per 1K output tokens
        }
        // Add other OpenAI models here
      },
      gemini: {
        'gemini-pro-vision': {
          input: 0.0025,  // per 1K input tokens
          output: 0.0025   // per 1K output tokens
        }
      }
      // Add other vision providers here
    },
--- a/package-lock.json
+++ b/package-lock.json
@@ -8,6 +8,7 @@
            "name": "video-audio-description-generator",
            "version": "1.0.0",
            "dependencies": {
                "@google/generative-ai": "^0.24.0",
                "axios": "^1.6.2",
                "dotenv": "^16.3.1",
                "fluent-ffmpeg": "^2.1.2",
@@ -18,6 +19,14 @@
                "node": ">=14.0.0"
            }
        },
        "node_modules/@google/generative-ai": {
            "version": "0.24.0",
            "resolved": "https://registry.npmjs.org/@google/generative-ai/-/generative-ai-0.24.0.tgz",
            "integrity": "sha512-fnEITCGEB7NdX0BhoYZ/cq/7WPZ1QS5IzJJfC3Tg/OwkvBetMiVJciyaan297OvE4B9Jg1xvo0zIazX/9sGu1Q==",
            "engines": {
                "node": ">=18.0.0"
            }
        },
        "node_modules/@types/node": {
            "version": "18.19.79",
            "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.79.tgz",
--- a/package.json
+++ b/package.json
@@ -7,6 +7,7 @@
        "start": "node index.js"
    },
    "dependencies": {
        "@google/generative-ai": "^0.24.0",
        "axios": "^1.6.2",
        "dotenv": "^16.3.1",
        "fluent-ffmpeg": "^2.1.2",
Author	SHA1	Message	Date
Talon	6ffb3f45ba	Merge branch 'other-providers'	2025-03-12 22:53:01 +01:00
the-byte-bender	8e6ca2b0e2	Add Google Gemini vision provider implementation	2025-03-13 00:32:52 +03:00