Ollama vision provider
parent 6ffb3f45ba
commit 9425b4b256
index.js (155 lines changed)
@@ -20,8 +20,8 @@ const defaultConfig = {
     batchPrompt: "Describe the sequence of frames in this batch over time for someone who cannot see it. Focus on what happens, changes, or stands out visually during these seconds. Keep it to 1-3 concise sentences, avoiding words like 'in these frames'—just describe what's happening. Use context from the previous batch if relevant. Keep sentences short and concise. Avoid speculation or overly verbose or unnecessary sentences. Try not to use nested sentences and keep sentences short to help flow. This will be used for audio description and mixed back in with the video file later, so we need to maintain consistency and quick pacing. Avoid using phrases such as 'as evidenced by' or 'suggesting'. Only focus on describing the visual scene. Do not repeat information given in the previous prompt, and focus only on what has changed since that description. Avoid talking about the scene or sequence, simply focus on the action within these frames. The listener knows that this is a video, so we do not need to remind them. Also avoid overusing phrases such as 'the scene shifts', the shifting or perspective change should be evident from the description of the sequence itself.",

     // Vision AI settings
-    visionProvider: "openai",
-    visionModel: "gpt-4o",
+    visionProvider: "gemini",
+    visionModel: "gemini-2.0-flash",
     visionProviders: {
         openai: {
             apiKey: process.env.OPENAI_API_KEY,
@@ -32,6 +32,12 @@ const defaultConfig = {
             apiKey: process.env.GOOGLE_API_KEY,
             model: "gemini-2.0-flash",
             maxTokens: 300
         },
+        ollama: {
+            // Example config; adjust to match your local Ollama setup
+            baseUrl: "http://localhost:11434", // or wherever Ollama is hosted
+            model: "gemma3:12b",
+            maxTokens: 3000
+        }
         // Add other vision providers here
     },
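With the ollama block in place, switching the pipeline to a local model should only require changing the selector keys. A minimal sketch of a user override (this assumes overrides are spread over defaultConfig; the actual config-loading mechanism is not shown in this diff):

const config = {
    ...defaultConfig,
    visionProvider: "ollama",  // select the provider added in this commit
    visionModel: "gemma3:12b", // any multimodal model pulled into Ollama
};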
@@ -50,8 +56,8 @@ const defaultConfig = {
     },

     // Video processing settings
-    outputDir: "/mnt/e/desc/output/",
-    tempDir: "/mnt/e/desc/temp/",
+    outputDir: "./desc/output/",
+    tempDir: "./desc/tmp/",
     batchTimeMode: true, // Whether to use the new batch time mode
     batchWindowDuration: 15, // How many seconds each batch covers
     framesInBatch: 10, // How many frames to capture within each batch
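The last two settings work together: a 15-second batchWindowDuration sampled with framesInBatch = 10 yields one frame every 1.5 seconds. A small sketch of that timestamp math (the helper is hypothetical; the actual frame-extraction code is outside this diff):

// Hypothetical helper: evenly spaced capture times within one batch window.
function batchFrameTimestamps(batchStartSec, windowDuration, framesInBatch) {
    const step = windowDuration / framesInBatch; // 15 / 10 = 1.5 s between frames
    return Array.from({ length: framesInBatch }, (_, i) => batchStartSec + i * step);
}
// batchFrameTimestamps(30, 15, 10) -> [30, 31.5, 33, ..., 43.5]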
@@ -83,6 +89,8 @@ class VisionProviderFactory {
                 return new OpenAIVisionProvider(providerConfig);
             case 'gemini':
                 return new GeminiVisionProvider(providerConfig);
+            case "ollama":
+                return new OllamaVisionProvider(providerConfig);
             // Add other providers here
             default:
                 throw new Error(`Vision provider "${providerName}" not implemented.`);
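The factory keys this switch off the configured provider name and hands the constructor the matching block from visionProviders. Only the switch body is visible here, so the surrounding call shape is an assumption:

// Assumed call site; the factory method's real name and signature are not in this diff.
const providerName = config.visionProvider;                  // e.g. "ollama"
const providerConfig = config.visionProviders[providerName]; // block added in this commit
const vision = VisionProviderFactory.create(providerName, providerConfig);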
@@ -468,6 +476,145 @@ class GeminiVisionProvider {
     }
 }
 
+/**
+ * Ollama Vision Provider Implementation
+ * See: https://github.com/ollama/ollama/blob/main/docs/api.md
+ */
+class OllamaVisionProvider {
+    constructor(config) {
+        this.config = config;
+        this.axiosInstance = axios.create({
+            // Default to Ollama's standard port (11434), matching the config above
+            baseURL: config.baseUrl || "http://localhost:11434",
+            headers: { "Content-Type": "application/json" }
+        });
+    }
+
+    /**
+     * Describe a single image
+     * @param {string} imagePath - Path to the image file
+     * @param {string} prompt - Prompt for the AI
+     * @returns {Promise<{description: string, usage: object}>} Description and usage stats
+     */
+    async describeImage(imagePath, prompt) {
+        try {
+            const imageData = fs.readFileSync(imagePath);
+            const base64Image = imageData.toString('base64');
+
+            const response = await this.axiosInstance.post('/api/generate', {
+                model: this.config.model,
+                prompt: prompt,
+                images: [base64Image],
+                stream: false,
+                options: {
+                    // Ollama names its max-token option "num_predict", not "max_tokens"
+                    num_predict: this.config.maxTokens || 300,
+                    temperature: 0.1
+                }
+            });
+
+            const combinedText = response.data.response || "";
+            return {
+                description: combinedText.trim(),
+                usage: {
+                    // Ollama reports token counts as prompt_eval_count / eval_count
+                    inputTokens: response.data.prompt_eval_count || 0,
+                    outputTokens: response.data.eval_count || 0,
+                    totalTokens: (response.data.prompt_eval_count || 0) + (response.data.eval_count || 0)
+                }
+            };
+        } catch (error) {
+            console.error("Ollama describeImage error:", error);
+            return {
+                description: "Unable to describe this image.",
+                usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
+            };
+        }
+    }
+
+    /**
+     * Compare two images and describe differences
+     * @param {string} image1Path - Path to the first image
+     * @param {string} image2Path - Path to the second image
+     * @param {string} prompt - Prompt for the AI
+     * @returns {Promise<{description: string, usage: object}>} Description and usage stats
+     */
+    async compareImages(image1Path, image2Path, prompt) {
+        try {
+            const image1Data = fs.readFileSync(image1Path).toString('base64');
+            const image2Data = fs.readFileSync(image2Path).toString('base64');
+
+            const response = await this.axiosInstance.post('/api/generate', {
+                model: this.config.model,
+                prompt: prompt,
+                images: [image1Data, image2Data],
+                stream: false,
+                options: {
+                    num_predict: this.config.maxTokens || 300, // Ollama's max-token option
+                    temperature: 0.2
+                }
+            });
+
+            const combinedText = response.data.response || "";
+            return {
+                description: combinedText.trim(),
+                usage: {
+                    inputTokens: response.data.prompt_eval_count || 0,
+                    outputTokens: response.data.eval_count || 0,
+                    totalTokens: (response.data.prompt_eval_count || 0) + (response.data.eval_count || 0)
+                }
+            };
+        } catch (error) {
+            console.error("Ollama compareImages error:", error);
+            return {
+                description: "Unable to describe the differences.",
+                usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
+            };
+        }
+    }
+
+    /**
+     * Describe a batch of images
+     * @param {string[]} imagePaths - Array of paths to the images
+     * @param {object} lastBatchContext - Context from the previous batch (optional)
+     * @param {string} prompt - Prompt for the AI
+     * @returns {Promise<{description: string, usage: object}>} Description and usage stats
+     */
+    async describeBatch(imagePaths, lastBatchContext, prompt) {
+        try {
+            let userPrompt = prompt;
+
+            // If there's context, prepend it. This helps maintain a storyline across batches.
+            if (lastBatchContext && lastBatchContext.lastDescription) {
+                userPrompt = `Previous batch summary: ${lastBatchContext.lastDescription}\n\n${prompt}`;
+            }
+
+            // Convert images to base64
+            const imagesBase64 = imagePaths.map(fp => {
+                const imageData = fs.readFileSync(fp);
+                return imageData.toString('base64');
+            });
+
+            const response = await this.axiosInstance.post('/api/generate', {
+                model: this.config.model,
+                prompt: userPrompt,
+                images: imagesBase64,
+                stream: false,
+                /*options: {
+                    num_predict: this.config.maxTokens || 300,
+                    temperature: 0.2
+                }*/
+            }, {
+                timeout: 120000 // 120000 ms = 2 minutes; multi-image batches can be slow on local models
+            });
+
+            const combinedText = response.data.response || "";
+            return {
+                description: combinedText.trim(),
+                usage: {
+                    inputTokens: response.data.prompt_eval_count || 0,
+                    outputTokens: response.data.eval_count || 0,
+                    totalTokens: (response.data.prompt_eval_count || 0) + (response.data.eval_count || 0)
+                }
+            };
+        } catch (error) {
+            console.error("Ollama describeBatch error:", error);
+            return {
+                description: "Unable to describe this batch of images.",
+                usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
+            };
+        }
+    }
+}
 
 /**
  * OpenAI TTS Provider Implementation
  */
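The new class mirrors the interface of the OpenAI and Gemini providers (describeImage, compareImages, describeBatch), so existing call sites should work unchanged. A quick sketch of driving it directly, to be run inside an async function (the paths and model are illustrative; the model must already be pulled, e.g. "ollama pull gemma3:12b"):

const provider = new OllamaVisionProvider({
    baseUrl: "http://localhost:11434",
    model: "gemma3:12b",
    maxTokens: 3000
});

// Single frame
const single = await provider.describeImage("./desc/tmp/frame_0001.jpg", "Describe this frame.");
console.log(single.description);

// One 15-second batch, carrying context forward from the previous batch
const batch = await provider.describeBatch(
    ["./desc/tmp/frame_0001.jpg", "./desc/tmp/frame_0002.jpg"],
    { lastDescription: single.description },
    defaultConfig.batchPrompt
);
console.log(batch.description, batch.usage);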