Compare commits
	
		
			1 Commits
		
	
	
		
			5312410d7e
			...
			other-prov
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 8e6ca2b0e2 | 
							
								
								
									
										193
									
								
								index.js
									
									
									
									
									
								
							
							
						
						
									
										193
									
								
								index.js
									
									
									
									
									
								
							@@ -28,6 +28,11 @@ const defaultConfig = {
 | 
			
		||||
      model: "gpt-4o",
 | 
			
		||||
      maxTokens: 300
 | 
			
		||||
    },
 | 
			
		||||
    gemini: {
 | 
			
		||||
      apiKey: process.env.GOOGLE_API_KEY,
 | 
			
		||||
      model: "gemini-2.0-flash",
 | 
			
		||||
      maxTokens: 300
 | 
			
		||||
    }
 | 
			
		||||
    // Add other vision providers here
 | 
			
		||||
  },
 | 
			
		||||
  
 | 
			
		||||
@@ -76,6 +81,8 @@ class VisionProviderFactory {
 | 
			
		||||
    switch (providerName) {
 | 
			
		||||
      case 'openai':
 | 
			
		||||
        return new OpenAIVisionProvider(providerConfig);
 | 
			
		||||
      case 'gemini':
 | 
			
		||||
        return new GeminiVisionProvider(providerConfig);
 | 
			
		||||
      // Add other providers here
 | 
			
		||||
      default:
 | 
			
		||||
        throw new Error(`Vision provider "${providerName}" not implemented.`);
 | 
			
		||||
@@ -129,7 +136,6 @@ class OpenAIVisionProvider {
 | 
			
		||||
 | 
			
		||||
      const response = await this.openai.chat.completions.create({
 | 
			
		||||
        model: this.config.model,
 | 
			
		||||
        temperature: 0.1,
 | 
			
		||||
        messages: [
 | 
			
		||||
          {
 | 
			
		||||
            role: "user",
 | 
			
		||||
@@ -288,6 +294,179 @@ class OpenAIVisionProvider {
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Google Gemini Vision Provider Implementation
 | 
			
		||||
 */
 | 
			
		||||
class GeminiVisionProvider {
  // Maps lowercase file extensions to Gemini-supported image MIME types.
  // Anything unrecognized falls back to JPEG (the previous hard-coded behavior).
  static #MIME_TYPES = {
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.webp': 'image/webp',
    '.heic': 'image/heic',
    '.heif': 'image/heif'
  };

  /**
   * @param {object} config - Provider config; must include `apiKey` and `model`.
   */
  constructor(config) {
    this.config = config;

    // Import the Google Generative AI SDK lazily so the dependency is only
    // loaded when this provider is actually selected.
    const { GoogleGenerativeAI } = require("@google/generative-ai");

    // Initialize the API client and the configured vision model.
    this.genAI = new GoogleGenerativeAI(config.apiKey);
    this.model = this.genAI.getGenerativeModel({ model: config.model });
  }

  /**
   * Detect the MIME type from the file extension.
   * @param {string} filePath - Path whose extension determines the type
   * @returns {string} A MIME type; defaults to image/jpeg for unknown extensions
   */
  static #mimeTypeFor(filePath) {
    const match = /\.[^./\\]+$/.exec(filePath.toLowerCase());
    return (match && GeminiVisionProvider.#MIME_TYPES[match[0]]) || 'image/jpeg';
  }

  /**
   * Read an image from disk and wrap it as a Gemini inlineData content part.
   * @param {string} imagePath - Path to the image file
   * @returns {{inlineData: {data: string, mimeType: string}}}
   */
  static #imagePart(imagePath) {
    return {
      inlineData: {
        data: fs.readFileSync(imagePath).toString('base64'),
        mimeType: GeminiVisionProvider.#mimeTypeFor(imagePath)
      }
    };
  }

  /**
   * Build a usage object, preferring the SDK's real token counts.
   * Recent @google/generative-ai versions expose `response.usageMetadata`
   * (promptTokenCount / candidatesTokenCount / totalTokenCount); when it is
   * absent we keep the previous rough estimate (~4 chars per token, ~1000
   * tokens per image).
   * @param {object} response - Gemini response object
   * @param {string} promptText - Full prompt text sent to the model
   * @param {string} responseText - Text returned by the model
   * @param {number} imageCount - Number of images included in the request
   * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}}
   */
  static #usage(response, promptText, responseText, imageCount) {
    const meta = response && response.usageMetadata;
    if (meta && typeof meta.promptTokenCount === 'number') {
      const inputTokens = meta.promptTokenCount;
      const outputTokens = meta.candidatesTokenCount ?? 0;
      return {
        inputTokens,
        outputTokens,
        totalTokens: meta.totalTokenCount ?? inputTokens + outputTokens
      };
    }
    // Fallback estimate when the SDK gives no usage metadata.
    const inputTokens = Math.ceil(promptText.length / 4) + 1000 * imageCount;
    const outputTokens = Math.ceil(responseText.length / 4);
    return { inputTokens, outputTokens, totalTokens: inputTokens + outputTokens };
  }

  /**
   * Describe a single image
   * @param {string} imagePath - Path to the image file
   * @param {string} prompt - Prompt for the AI
   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
   */
  async describeImage(imagePath, prompt) {
    try {
      const result = await this.model.generateContent([
        prompt,
        GeminiVisionProvider.#imagePart(imagePath)
      ]);
      const response = await result.response;
      const text = response.text();

      return {
        description: text,
        usage: GeminiVisionProvider.#usage(response, prompt, text, 1)
      };
    } catch (error) {
      console.error("Error describing image with Gemini:", error);
      return {
        description: "Unable to describe this image.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Compare two images and describe the differences
   * @param {string} image1Path - Path to the first image
   * @param {string} image2Path - Path to the second image
   * @param {string} prompt - Prompt for the AI
   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
   */
  async compareImages(image1Path, image2Path, prompt) {
    try {
      const result = await this.model.generateContent([
        prompt,
        GeminiVisionProvider.#imagePart(image1Path),
        GeminiVisionProvider.#imagePart(image2Path)
      ]);
      const response = await result.response;
      const text = response.text();

      return {
        description: text,
        usage: GeminiVisionProvider.#usage(response, prompt, text, 2)
      };
    } catch (error) {
      console.error("Error comparing images with Gemini:", error);
      return {
        description: "Unable to describe the differences between these images.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Describe a batch of images
   * @param {string[]} imagePaths - Array of paths to the images
   * @param {object} lastBatchContext - Context from the previous batch
   * @param {string} prompt - Prompt for the AI
   * @returns {Promise<{description: string, usage: object}>} Description and usage stats
   */
  async describeBatch(imagePaths, lastBatchContext, prompt) {
    try {
      // Prepend context from the previous batch, if any, to keep narration continuous.
      let contextualPrompt = prompt;
      if (lastBatchContext && lastBatchContext.lastDescription) {
        contextualPrompt = `Previous batch summary: ${lastBatchContext.lastDescription}\n\n${prompt}`;
      }

      // Content parts: the prompt first, then every image in order.
      const contentParts = [
        contextualPrompt,
        ...imagePaths.map((p) => GeminiVisionProvider.#imagePart(p))
      ];

      const result = await this.model.generateContent(contentParts);
      const response = await result.response;
      const text = response.text();

      return {
        description: text,
        usage: GeminiVisionProvider.#usage(response, contextualPrompt, text, imagePaths.length)
      };
    } catch (error) {
      console.error("Error describing batch of images with Gemini:", error);
      return {
        description: "Unable to describe this batch of images.",
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * OpenAI TTS Provider Implementation
 | 
			
		||||
 */
 | 
			
		||||
@@ -1081,6 +1260,12 @@ function printStats(stats, settings) {
 | 
			
		||||
          output: 0.01    // per 1K output tokens
 | 
			
		||||
        }
 | 
			
		||||
        // Add other OpenAI models here
 | 
			
		||||
      },
 | 
			
		||||
      gemini: {
 | 
			
		||||
        'gemini-pro-vision': {
 | 
			
		||||
          input: 0.0025,  // per 1K input tokens
 | 
			
		||||
          output: 0.0025   // per 1K output tokens
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      // Add other vision providers here
 | 
			
		||||
    },
 | 
			
		||||
@@ -1169,6 +1354,12 @@ async function estimateCost(videoFilePath, options = {}) {
 | 
			
		||||
          output: 0.01    // per 1K output tokens
 | 
			
		||||
        }
 | 
			
		||||
        // Add other OpenAI models here
 | 
			
		||||
      },
 | 
			
		||||
      gemini: {
 | 
			
		||||
        'gemini-pro-vision': {
 | 
			
		||||
          input: 0.0025,  // per 1K input tokens
 | 
			
		||||
          output: 0.0025   // per 1K output tokens
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      // Add other vision providers here
 | 
			
		||||
    },
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										9
									
								
								package-lock.json
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										9
									
								
								package-lock.json
									
									
									
										generated
									
									
									
								
							@@ -8,6 +8,7 @@
 | 
			
		||||
            "name": "video-audio-description-generator",
 | 
			
		||||
            "version": "1.0.0",
 | 
			
		||||
            "dependencies": {
 | 
			
		||||
                "@google/generative-ai": "^0.24.0",
 | 
			
		||||
                "axios": "^1.6.2",
 | 
			
		||||
                "dotenv": "^16.3.1",
 | 
			
		||||
                "fluent-ffmpeg": "^2.1.2",
 | 
			
		||||
@@ -18,6 +19,14 @@
 | 
			
		||||
                "node": ">=14.0.0"
 | 
			
		||||
            }
 | 
			
		||||
        },
 | 
			
		||||
        "node_modules/@google/generative-ai": {
 | 
			
		||||
            "version": "0.24.0",
 | 
			
		||||
            "resolved": "https://registry.npmjs.org/@google/generative-ai/-/generative-ai-0.24.0.tgz",
 | 
			
		||||
            "integrity": "sha512-fnEITCGEB7NdX0BhoYZ/cq/7WPZ1QS5IzJJfC3Tg/OwkvBetMiVJciyaan297OvE4B9Jg1xvo0zIazX/9sGu1Q==",
 | 
			
		||||
            "engines": {
 | 
			
		||||
                "node": ">=18.0.0"
 | 
			
		||||
            }
 | 
			
		||||
        },
 | 
			
		||||
        "node_modules/@types/node": {
 | 
			
		||||
            "version": "18.19.79",
 | 
			
		||||
            "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.79.tgz",
 | 
			
		||||
 
 | 
			
		||||
@@ -7,6 +7,7 @@
 | 
			
		||||
        "start": "node index.js"
 | 
			
		||||
    },
 | 
			
		||||
    "dependencies": {
 | 
			
		||||
        "@google/generative-ai": "^0.24.0",
 | 
			
		||||
        "axios": "^1.6.2",
 | 
			
		||||
        "dotenv": "^16.3.1",
 | 
			
		||||
        "fluent-ffmpeg": "^2.1.2",
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user