// aidio-description/src/providers/vision/geminiVisionProvider.ts

import fs from 'fs';
import path from 'path';
import { GoogleGenerativeAI } from '@google/generative-ai';
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';

/**
 * Google Gemini Vision Provider Implementation
 *
 * Sends one or more local image files, together with a text prompt, to a
 * Gemini generative model and returns the generated description plus token
 * usage. All three public methods are best-effort: any failure (unreadable
 * file, API error, ...) is logged and converted into a fallback description
 * with zeroed usage instead of being thrown.
 */
export class GeminiVisionProvider implements VisionProvider {
  /** Image MIME types keyed by lower-cased file extension (including the dot). */
  private static readonly MIME_TYPES_BY_EXTENSION: Readonly<Record<string, string>> = {
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.png': 'image/png',
    '.webp': 'image/webp',
    '.heic': 'image/heic',
    '.heif': 'image/heif'
  };

  /** MIME type used when the extension is missing or not recognised. */
  private static readonly DEFAULT_MIME_TYPE = 'image/jpeg';

  /** Rough per-image token cost, used only when the API reports no usage. */
  private static readonly ESTIMATED_TOKENS_PER_IMAGE = 1000;

  /** Rough characters-per-token ratio, used only when the API reports no usage. */
  private static readonly ESTIMATED_CHARS_PER_TOKEN = 4;

  private config: VisionProviderConfig;
  private genAI: GoogleGenerativeAI;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- SDK model handle; kept loosely typed
  private model: any;

  /**
   * @param config - Provider configuration; `apiKey` and `model` are read here.
   */
  constructor(config: VisionProviderConfig) {
    this.config = config;
    // NOTE(review): apiKey is asserted non-null and is not validated here.
    this.genAI = new GoogleGenerativeAI(config.apiKey!);
    this.model = this.genAI.getGenerativeModel({ model: config.model });
  }

  /**
   * Describe a single image
   * @param imagePath - Path to the image file
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
    return this.generate(
      prompt,
      [imagePath],
      "Error describing image with Gemini:",
      "Unable to describe this image."
    );
  }

  /**
   * Compare two images and describe the differences
   * @param image1Path - Path to the first image
   * @param image2Path - Path to the second image
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
    return this.generate(
      prompt,
      [image1Path, image2Path],
      "Error comparing images with Gemini:",
      "Unable to describe the differences between these images."
    );
  }

  /**
   * Describe a batch of images
   * @param imagePaths - Array of paths to the images
   * @param lastBatchContext - Context from the previous batch
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async describeBatch(
    imagePaths: string[],
    lastBatchContext: BatchContext,
    prompt: string
  ): Promise<VisionResult> {
    // Prepend the previous batch's description, when there is one, so the
    // model can keep continuity between consecutive batches.
    const previousDescription = lastBatchContext?.lastDescription;
    const contextualPrompt = previousDescription
      ? `Previous batch summary: ${previousDescription}\n\n${prompt}`
      : prompt;

    return this.generate(
      contextualPrompt,
      imagePaths,
      "Error describing batch of images with Gemini:",
      "Unable to describe this batch of images."
    );
  }

  /**
   * Shared request path: load the images, call the model, and build the result.
   * @param prompt - Final prompt text sent as the first content part
   * @param imagePaths - Images appended after the prompt, in order
   * @param errorLabel - Message logged in front of the error on failure
   * @param fallbackDescription - Description returned on failure
   * @returns Description and usage stats; never rejects
   */
  private async generate(
    prompt: string,
    imagePaths: readonly string[],
    errorLabel: string,
    fallbackDescription: string
  ): Promise<VisionResult> {
    try {
      // Files are read concurrently and without blocking the event loop;
      // Promise.all preserves the order of imagePaths.
      const imageParts = await Promise.all(
        imagePaths.map((imagePath) => this.loadImagePart(imagePath))
      );

      const result = await this.model.generateContent([prompt, ...imageParts]);
      const response = await result.response;
      const text: string = response.text();

      return {
        description: text,
        usage: this.resolveUsage(response.usageMetadata, prompt, text, imagePaths.length)
      };
    } catch (error) {
      console.error(errorLabel, error);
      return {
        description: fallbackDescription,
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Read an image from disk and wrap it as a base64 inline-data content part.
   * @param imagePath - Path to the image file
   * @returns Content part carrying the encoded bytes and their MIME type
   */
  private async loadImagePart(
    imagePath: string
  ): Promise<{ inlineData: { data: string; mimeType: string } }> {
    const imageData = await fs.promises.readFile(imagePath);
    return {
      inlineData: {
        data: imageData.toString('base64'),
        mimeType: this.resolveMimeType(imagePath)
      }
    };
  }

  /**
   * Pick the MIME type from the file extension (case-insensitive).
   * Unknown or missing extensions fall back to JPEG.
   * @param imagePath - Path to the image file
   */
  private resolveMimeType(imagePath: string): string {
    const extension = path.extname(imagePath).toLowerCase();
    return (
      GeminiVisionProvider.MIME_TYPES_BY_EXTENSION[extension] ??
      GeminiVisionProvider.DEFAULT_MIME_TYPE
    );
  }

  /**
   * Build the usage block, preferring the counts reported in the response's
   * usage metadata and falling back to a rough estimate for any that are absent.
   * @param usageMetadata - `usageMetadata` from the response, if any
   * @param prompt - Prompt text that was sent (used for the estimate)
   * @param text - Generated text (used for the estimate)
   * @param imageCount - Number of images that were sent (used for the estimate)
   */
  private resolveUsage(
    usageMetadata:
      | { promptTokenCount?: number; candidatesTokenCount?: number; totalTokenCount?: number }
      | undefined,
    prompt: string,
    text: string,
    imageCount: number
  ): VisionResult['usage'] {
    const charsPerToken = GeminiVisionProvider.ESTIMATED_CHARS_PER_TOKEN;
    const estimatedInput =
      Math.ceil(prompt.length / charsPerToken) +
      GeminiVisionProvider.ESTIMATED_TOKENS_PER_IMAGE * imageCount;
    const estimatedOutput = Math.ceil(text.length / charsPerToken);

    const inputTokens = usageMetadata?.promptTokenCount ?? estimatedInput;
    const outputTokens = usageMetadata?.candidatesTokenCount ?? estimatedOutput;
    const totalTokens = usageMetadata?.totalTokenCount ?? inputTokens + outputTokens;

    return { inputTokens, outputTokens, totalTokens };
  }
}