WIP typescript conversion

2025-06-10 19:24:13 +02:00
parent 9425b4b256
commit 507d4f6474
26 changed files with 2128 additions and 27 deletions
--- a/src/providers/vision/geminiVisionProvider.ts
+++ b/src/providers/vision/geminiVisionProvider.ts
@@ -0,0 +1,139 @@
+import fs from 'fs';
+import path from 'path';
+import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
+
+type GoogleGenerativeAI = any;
+type GenerativeModel = any;
+
+/** MIME types for inline image data, keyed by lower-case file extension. */
+const IMAGE_MIME_TYPES: Record<string, string> = {
+  '.jpg': 'image/jpeg',
+  '.jpeg': 'image/jpeg',
+  '.png': 'image/png',
+  '.webp': 'image/webp',
+  '.heic': 'image/heic',
+  '.heif': 'image/heif'
+};
+
+/** Rough per-image token cost used only when the API response carries no usage metadata. */
+const ESTIMATED_TOKENS_PER_IMAGE = 1000;
+
+/**
+ * Google Gemini Vision Provider Implementation
+ */
+export class GeminiVisionProvider implements VisionProvider {
+  private config: VisionProviderConfig;
+  private genAI: GoogleGenerativeAI;
+  private model: GenerativeModel;
+
+  /**
+   * @param config - Provider settings; `apiKey` and `model` are passed to the Gemini SDK
+   */
+  constructor(config: VisionProviderConfig) {
+    this.config = config;
+
+    // The SDK is loaded with require() at construction time, so it is only
+    // resolved when a Gemini provider is actually instantiated.
+    const { GoogleGenerativeAI } = require("@google/generative-ai");
+
+    this.genAI = new GoogleGenerativeAI(config.apiKey);
+    this.model = this.genAI.getGenerativeModel({ model: config.model });
+  }
+
+  /**
+   * Describe a single image
+   * @param imagePath - Path to the image file
+   * @param prompt - Prompt for the AI
+   * @returns Description and usage stats; a fallback description with zeroed usage on failure
+   */
+  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
+    try {
+      return await this.generate(prompt, [imagePath]);
+    } catch (error) {
+      console.error("Error describing image with Gemini:", error);
+      return GeminiVisionProvider.failure("Unable to describe this image.");
+    }
+  }
+
+  /**
+   * Compare two images and describe the differences
+   * @param image1Path - Path to the first image
+   * @param image2Path - Path to the second image
+   * @param prompt - Prompt for the AI
+   * @returns Description and usage stats; a fallback description with zeroed usage on failure
+   */
+  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
+    try {
+      return await this.generate(prompt, [image1Path, image2Path]);
+    } catch (error) {
+      console.error("Error comparing images with Gemini:", error);
+      return GeminiVisionProvider.failure("Unable to describe the differences between these images.");
+    }
+  }
+
+  /**
+   * Describe a batch of images
+   * @param imagePaths - Array of paths to the images
+   * @param lastBatchContext - Context from the previous batch
+   * @param prompt - Prompt for the AI
+   * @returns Description and usage stats; a fallback description with zeroed usage on failure
+   */
+  async describeBatch(
+    imagePaths: string[],
+    lastBatchContext: BatchContext,
+    prompt: string
+  ): Promise<VisionResult> {
+    try {
+      // Prepend the previous batch's summary, when there is one, ahead of the prompt.
+      const contextualPrompt = lastBatchContext && lastBatchContext.lastDescription
+        ? `Previous batch summary: ${lastBatchContext.lastDescription}\n\n${prompt}`
+        : prompt;
+
+      return await this.generate(contextualPrompt, imagePaths);
+    } catch (error) {
+      console.error("Error describing batch of images with Gemini:", error);
+      return GeminiVisionProvider.failure("Unable to describe this batch of images.");
+    }
+  }
+
+  /**
+   * Send the prompt followed by the images to the model and package the reply.
+   * Errors (unreadable file, API failure) propagate to the caller.
+   */
+  private async generate(prompt: string, imagePaths: string[]): Promise<VisionResult> {
+    const parts: any[] = [prompt, ...imagePaths.map(fp => GeminiVisionProvider.toInlinePart(fp))];
+
+    const result = await this.model.generateContent(parts);
+    const response = await result.response;
+    const text: string = response.text();
+
+    // Prefer the token counts the API reports in usageMetadata; fall back to a
+    // ~4-characters-per-token estimate when a count is missing.
+    const reported = response.usageMetadata;
+    const inputTokens: number = reported?.promptTokenCount
+      ?? Math.ceil(prompt.length / 4) + ESTIMATED_TOKENS_PER_IMAGE * imagePaths.length;
+    const outputTokens: number = reported?.candidatesTokenCount ?? Math.ceil(text.length / 4);
+    const totalTokens: number = reported?.totalTokenCount ?? inputTokens + outputTokens;
+
+    return { description: text, usage: { inputTokens, outputTokens, totalTokens } };
+  }
+
+  /** Read an image from disk into the inline-data part shape passed to generateContent. */
+  private static toInlinePart(imagePath: string) {
+    const mimeType = IMAGE_MIME_TYPES[path.extname(imagePath).toLowerCase()] ?? 'image/jpeg';
+    return {
+      inlineData: {
+        data: fs.readFileSync(imagePath).toString('base64'),
+        mimeType
+      }
+    };
+  }
+
+  /** Build the result returned when a request fails: the given text and zeroed usage. */
+  private static failure(description: string): VisionResult {
+    return {
+      description,
+      usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
+    };
+  }
+}
--- a/src/providers/vision/index.ts
+++ b/src/providers/vision/index.ts
@@ -0,0 +1,4 @@
+export * from './visionProviderFactory'; // VisionProviderFactory: picks a provider from config
+export * from './openAIVisionProvider'; // OpenAIVisionProvider
+export * from './geminiVisionProvider'; // GeminiVisionProvider
+export * from './ollamaVisionProvider'; // OllamaVisionProvider
--- a/src/providers/vision/ollamaVisionProvider.ts
+++ b/src/providers/vision/ollamaVisionProvider.ts
@@ -0,0 +1,129 @@
+import fs from 'fs';
+import axios, { AxiosInstance } from 'axios';
+import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
+
+/** Timeout applied to batch requests: 120000 ms = 2 minutes. */
+const BATCH_TIMEOUT_MS = 120000;
+
+/**
+ * Ollama Vision Provider Implementation
+ * See: https://github.com/ollama/ollama/blob/main/docs/api.md
+ */
+export class OllamaVisionProvider implements VisionProvider {
+  private config: VisionProviderConfig;
+  private axiosInstance: AxiosInstance;
+
+  /**
+   * @param config - Provider settings; `baseUrl` falls back to http://localhost:11434
+   */
+  constructor(config: VisionProviderConfig) {
+    this.config = config;
+    this.axiosInstance = axios.create({
+      baseURL: config.baseUrl || "http://localhost:11434",
+      headers: { "Content-Type": "application/json" }
+    });
+  }
+
+  /**
+   * Describe a single image
+   * @param imagePath - Path to the image file
+   * @param prompt - Prompt for the AI
+   * @returns Description and usage stats; a fallback description with zeroed usage on failure
+   */
+  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
+    try {
+      return await this.generate(prompt, [imagePath], 0.1);
+    } catch (error) {
+      console.error("Ollama describeImage error:", error);
+      return OllamaVisionProvider.failure("Unable to describe this image.");
+    }
+  }
+
+  /**
+   * Compare two images and describe differences
+   * @param image1Path - Path to the first image
+   * @param image2Path - Path to the second image
+   * @param prompt - Prompt for the AI
+   * @returns Description and usage stats; a fallback description with zeroed usage on failure
+   */
+  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
+    try {
+      return await this.generate(prompt, [image1Path, image2Path], 0.2);
+    } catch (error) {
+      console.error("Ollama compareImages error:", error);
+      return OllamaVisionProvider.failure("Unable to describe the differences.");
+    }
+  }
+
+  /**
+   * Describe a batch of images
+   * @param imagePaths - Array of paths to the images
+   * @param lastBatchContext - Context from the previous batch (optional)
+   * @param prompt - Prompt for the AI
+   * @returns Description and usage stats; a fallback description with zeroed usage on failure
+   */
+  async describeBatch(
+    imagePaths: string[],
+    lastBatchContext: BatchContext,
+    prompt: string
+  ): Promise<VisionResult> {
+    try {
+      // If there's context, prepend it. This helps maintain a storyline across batches.
+      const userPrompt = lastBatchContext && lastBatchContext.lastDescription
+        ? `Previous batch summary: ${lastBatchContext.lastDescription}\n\n${prompt}`
+        : prompt;
+
+      return await this.generate(userPrompt, imagePaths, 0.2, BATCH_TIMEOUT_MS);
+    } catch (error) {
+      console.error("Ollama describeBatch error:", error);
+      return OllamaVisionProvider.failure("Unable to describe this batch of images.");
+    }
+  }
+
+  /**
+   * POST the prompt and base64-encoded images to /api/generate (non-streaming).
+   * Errors (unreadable file, HTTP failure, timeout) propagate to the caller.
+   * @param prompt - Full prompt text sent to the model
+   * @param imagePaths - Paths of the images to read and attach
+   * @param temperature - Sampling temperature placed in the request options
+   * @param timeoutMs - Request timeout in milliseconds; when omitted no per-request timeout is set
+   */
+  private async generate(
+    prompt: string,
+    imagePaths: string[],
+    temperature: number,
+    timeoutMs?: number
+  ): Promise<VisionResult> {
+    const images = imagePaths.map(fp => fs.readFileSync(fp).toString('base64'));
+
+    const response = await this.axiosInstance.post('/api/generate', {
+      model: this.config.model,
+      prompt,
+      images,
+      stream: false,
+      options: {
+        // Ollama caps generated tokens with `num_predict`; `max_tokens` is not one of its options.
+        num_predict: this.config.maxTokens || 300,
+        temperature
+      }
+    }, timeoutMs === undefined ? undefined : { timeout: timeoutMs });
+
+    const description: string = (response.data.response || "").trim();
+    // prompt_eval_count / eval_count are the token counts in the reply; treat a missing one as 0.
+    const inputTokens: number = response.data.prompt_eval_count ?? 0;
+    const outputTokens: number = response.data.eval_count ?? 0;
+
+    return {
+      description,
+      usage: { inputTokens, outputTokens, totalTokens: inputTokens + outputTokens }
+    };
+  }
+
+  /** Build the result returned when a request fails: the given text and zeroed usage. */
+  private static failure(description: string): VisionResult {
+    return {
+      description,
+      usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
+    };
+  }
+}
--- a/src/providers/vision/openAIVisionProvider.ts
+++ b/src/providers/vision/openAIVisionProvider.ts
@@ -0,0 +1,144 @@
+import fs from 'fs';
+import path from 'path';
+import { OpenAI } from 'openai';
+import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
+
+/** MIME types used in image data URLs, keyed by lower-case file extension. */
+const IMAGE_MIME_TYPES: Record<string, string> = {
+  '.jpg': 'image/jpeg',
+  '.jpeg': 'image/jpeg',
+  '.png': 'image/png',
+  '.gif': 'image/gif',
+  '.webp': 'image/webp'
+};
+
+/** Text part of a chat message. */
+type TextPart = { type: "text"; text: string };
+/** Image part of a chat message, carried as a base64 data URL. */
+type ImagePart = { type: "image_url"; image_url: { url: string } };
+
+/**
+ * OpenAI Vision Provider Implementation
+ */
+export class OpenAIVisionProvider implements VisionProvider {
+  private config: VisionProviderConfig;
+  private openai: OpenAI;
+
+  /**
+   * @param config - Provider settings; `apiKey` is passed to the OpenAI client
+   */
+  constructor(config: VisionProviderConfig) {
+    this.config = config;
+    this.openai = new OpenAI({
+      apiKey: config.apiKey,
+    });
+  }
+
+  /**
+   * Describe a single image
+   * @param imagePath - Path to the image file
+   * @param prompt - Prompt for the AI
+   * @returns Description and usage stats; a fallback description with zeroed usage on failure
+   */
+  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
+    try {
+      return await this.complete(prompt, [imagePath], { temperature: 0.1 });
+    } catch (error) {
+      console.error("Error describing image:", error);
+      return OpenAIVisionProvider.failure("Unable to describe this image.");
+    }
+  }
+
+  /**
+   * Compare two images and describe the differences
+   * @param image1Path - Path to the first image
+   * @param image2Path - Path to the second image
+   * @param prompt - Prompt for the AI
+   * @returns Description and usage stats; a fallback description with zeroed usage on failure
+   */
+  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
+    try {
+      return await this.complete(prompt, [image1Path, image2Path]);
+    } catch (error) {
+      console.error("Error comparing images:", error);
+      return OpenAIVisionProvider.failure("Unable to describe the differences between these images.");
+    }
+  }
+
+  /**
+   * Describe a batch of images
+   * @param imagePaths - Array of paths to the images
+   * @param lastBatchContext - Context from the previous batch
+   * @param prompt - Prompt for the AI
+   * @returns Description and usage stats; a fallback description with zeroed usage on failure
+   */
+  async describeBatch(
+    imagePaths: string[],
+    lastBatchContext: BatchContext,
+    prompt: string
+  ): Promise<VisionResult> {
+    try {
+      // If we have some text context from the last batch, send it as a leading system message.
+      const summary = lastBatchContext && lastBatchContext.lastDescription;
+      return await this.complete(prompt, imagePaths, {
+        systemText: summary ? `Previous batch summary: ${summary}` : undefined
+      });
+    } catch (error) {
+      console.error("Error describing batch of images:", error);
+      return OpenAIVisionProvider.failure("Unable to describe this batch of images.");
+    }
+  }
+
+  /**
+   * Send one user message (prompt text followed by the images) and package the reply.
+   * Errors (unreadable file, API failure) propagate to the caller.
+   * @param prompt - Text part of the user message
+   * @param imagePaths - Paths of the images appended to the user message, in order
+   * @param options - Optional system message text and sampling temperature
+   */
+  private async complete(
+    prompt: string,
+    imagePaths: string[],
+    options: { systemText?: string; temperature?: number } = {}
+  ): Promise<VisionResult> {
+    const content: Array<TextPart | ImagePart> = [
+      { type: "text", text: prompt },
+      ...imagePaths.map(fp => OpenAIVisionProvider.toImagePart(fp))
+    ];
+
+    const response = await this.openai.chat.completions.create({
+      model: this.config.model,
+      // Left undefined unless the caller set it; an undefined field is dropped from the JSON body.
+      temperature: options.temperature,
+      messages: [
+        ...(options.systemText ? [{ role: "system" as const, content: options.systemText }] : []),
+        { role: "user" as const, content }
+      ],
+      max_tokens: this.config.maxTokens || 300
+    });
+
+    return {
+      description: response.choices[0]?.message?.content?.trim() || "No description generated.",
+      usage: {
+        inputTokens: response.usage?.prompt_tokens ?? 0,
+        outputTokens: response.usage?.completion_tokens ?? 0,
+        totalTokens: response.usage?.total_tokens ?? 0
+      }
+    };
+  }
+
+  /** Read an image from disk into an image_url part whose data URL carries the file's MIME type. */
+  private static toImagePart(imagePath: string): ImagePart {
+    const mimeType = IMAGE_MIME_TYPES[path.extname(imagePath).toLowerCase()] ?? 'image/jpeg';
+    const base64 = fs.readFileSync(imagePath).toString('base64');
+    return { type: "image_url", image_url: { url: `data:${mimeType};base64,${base64}` } };
+  }
+
+  /** Build the result returned when a request fails: the given text and zeroed usage. */
+  private static failure(description: string): VisionResult {
+    return {
+      description,
+      usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
+    };
+  }
+}
--- a/src/providers/vision/visionProviderFactory.ts
+++ b/src/providers/vision/visionProviderFactory.ts
@@ -0,0 +1,31 @@
+import { VisionProvider } from '../../interfaces';
+import { Config } from '../../config/config';
+import { OpenAIVisionProvider } from './openAIVisionProvider';
+import { GeminiVisionProvider } from './geminiVisionProvider';
+import { OllamaVisionProvider } from './ollamaVisionProvider';
+
+/**
+ * Builds the vision provider selected by the application configuration.
+ */
+export class VisionProviderFactory {
+  /**
+   * Instantiate the provider named by `config.visionProvider`.
+   * @param config - Configuration holding the provider name and the per-provider settings
+   * @returns A provider constructed with the matching `config.visionProviders` entry
+   * @throws Error if the named provider has no settings entry or no implementation
+   */
+  static getProvider(config: Config): VisionProvider {
+    const selected = config.visionProvider;
+    const settings = config.visionProviders[selected];
+
+    if (!settings) {
+      throw new Error(`Vision provider "${selected}" not configured.`);
+    }
+
+    if (selected === 'openai') return new OpenAIVisionProvider(settings);
+    if (selected === 'gemini') return new GeminiVisionProvider(settings);
+    if (selected === "ollama") return new OllamaVisionProvider(settings);
+    // Add other providers here
+    throw new Error(`Vision provider "${selected}" not implemented.`);
+  }
+}