WIP typescript conversion
This commit is contained in:
186
src/providers/vision/geminiVisionProvider.ts
Normal file
186
src/providers/vision/geminiVisionProvider.ts
Normal file
@@ -0,0 +1,186 @@
|
||||
import fs from 'fs';
|
||||
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
|
||||
|
||||
/** Inline image payload in the shape this provider passes to the Gemini SDK as a content part. */
interface GeminiInlineImagePart {
  inlineData: { data: string; mimeType: string };
}

/** Token counts the SDK may attach to a response; every field is treated as optional. */
interface GeminiUsageMetadata {
  promptTokenCount?: number;
  candidatesTokenCount?: number;
  totalTokenCount?: number;
}

/** The slice of a Gemini response that this provider reads. */
interface GeminiResponse {
  text(): string;
  usageMetadata?: GeminiUsageMetadata;
}

/**
 * Minimal structural view of the SDK model object. The SDK is loaded with
 * `require`, so it arrives untyped; only the members used below are declared.
 */
interface GenerativeModel {
  generateContent(
    parts: Array<string | GeminiInlineImagePart>
  ): Promise<{ response: GeminiResponse }>;
}

/** Minimal structural view of the SDK client object. */
interface GoogleGenerativeAI {
  getGenerativeModel(params: { model: string }): GenerativeModel;
}

/**
 * Google Gemini Vision Provider Implementation
 *
 * Every public method sends a text prompt plus one or more inline, base64
 * encoded images to the configured Gemini model. Failures are logged and
 * converted into a fallback description with zeroed usage instead of being
 * rethrown.
 */
export class GeminiVisionProvider implements VisionProvider {
  private config: VisionProviderConfig;
  private genAI: GoogleGenerativeAI;
  private model: GenerativeModel;

  /**
   * @param config - Provider settings; `apiKey` and `model` are read here
   */
  constructor(config: VisionProviderConfig) {
    this.config = config;

    // Loaded at construction time, so the package is only required when this
    // provider is actually instantiated.
    const sdk = require("@google/generative-ai");

    this.genAI = new sdk.GoogleGenerativeAI(config.apiKey);
    this.model = this.genAI.getGenerativeModel({ model: config.model });
  }

  /**
   * Describe a single image
   * @param imagePath - Path to the image file
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
    return this.generate(
      [imagePath],
      prompt,
      "Error describing image with Gemini:",
      "Unable to describe this image."
    );
  }

  /**
   * Compare two images and describe the differences
   * @param image1Path - Path to the first image
   * @param image2Path - Path to the second image
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
    return this.generate(
      [image1Path, image2Path],
      prompt,
      "Error comparing images with Gemini:",
      "Unable to describe the differences between these images."
    );
  }

  /**
   * Describe a batch of images
   * @param imagePaths - Array of paths to the images
   * @param lastBatchContext - Context from the previous batch; when it carries
   *   a `lastDescription`, that text is prepended to the prompt
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async describeBatch(
    imagePaths: string[],
    lastBatchContext: BatchContext,
    prompt: string
  ): Promise<VisionResult> {
    const contextualPrompt = lastBatchContext?.lastDescription
      ? `Previous batch summary: ${lastBatchContext.lastDescription}\n\n${prompt}`
      : prompt;

    return this.generate(
      imagePaths,
      contextualPrompt,
      "Error describing batch of images with Gemini:",
      "Unable to describe this batch of images."
    );
  }

  /**
   * Shared request path: reads the images, sends prompt + images to the
   * model, and converts the reply into a `VisionResult`.
   *
   * Token usage comes from the response's `usageMetadata` when it is present.
   * Otherwise it is estimated at roughly four characters per token, plus a
   * flat 1000 input tokens per image.
   *
   * @param imagePaths - Images to attach, in order
   * @param prompt - Full prompt text (context already merged in)
   * @param errorLabel - Message logged in front of any caught error
   * @param fallbackDescription - Description returned when the call fails
   * @returns Description and usage stats; never rejects
   */
  private async generate(
    imagePaths: string[],
    prompt: string,
    errorLabel: string,
    fallbackDescription: string
  ): Promise<VisionResult> {
    try {
      const imageParts = imagePaths.map(
        (imagePath): GeminiInlineImagePart => ({
          inlineData: {
            data: fs.readFileSync(imagePath).toString('base64'),
            mimeType: GeminiVisionProvider.mimeTypeFor(imagePath)
          }
        })
      );

      const { response } = await this.model.generateContent([prompt, ...imageParts]);
      const text = response.text();

      const reported = response.usageMetadata;
      const inputTokens =
        reported?.promptTokenCount ?? (Math.ceil(prompt.length / 4) + 1000 * imagePaths.length);
      const outputTokens = reported?.candidatesTokenCount ?? Math.ceil(text.length / 4);

      return {
        description: text,
        usage: {
          inputTokens,
          outputTokens,
          totalTokens: reported?.totalTokenCount ?? (inputTokens + outputTokens)
        }
      };
    } catch (error) {
      console.error(errorLabel, error);
      return {
        description: fallbackDescription,
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Picks a MIME type from the file extension (case-insensitive).
   * Anything unrecognised is labelled `image/jpeg`, which is what every file
   * was labelled before extension detection was added.
   */
  private static mimeTypeFor(imagePath: string): string {
    const dot = imagePath.lastIndexOf('.');
    const extension = dot === -1 ? '' : imagePath.slice(dot + 1).toLowerCase();

    switch (extension) {
      case 'png':
        return 'image/png';
      case 'webp':
        return 'image/webp';
      case 'heic':
        return 'image/heic';
      case 'heif':
        return 'image/heif';
      default:
        return 'image/jpeg';
    }
  }
}
|
||||
4
src/providers/vision/index.ts
Normal file
4
src/providers/vision/index.ts
Normal file
@@ -0,0 +1,4 @@
|
||||
// Barrel for the vision providers: re-exports everything exported by the
// factory module and by each of the three provider modules.
export * from './visionProviderFactory';
|
||||
export * from './openAIVisionProvider';
|
||||
export * from './geminiVisionProvider';
|
||||
export * from './ollamaVisionProvider';
|
||||
151
src/providers/vision/ollamaVisionProvider.ts
Normal file
151
src/providers/vision/ollamaVisionProvider.ts
Normal file
@@ -0,0 +1,151 @@
|
||||
import fs from 'fs';
|
||||
import axios, { AxiosInstance } from 'axios';
|
||||
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
|
||||
|
||||
/** The fields of a non-streaming `/api/generate` reply that this provider reads. */
interface OllamaGenerateResponse {
  response?: string;
  prompt_eval_count?: number;
  eval_count?: number;
}

/** Everything one `/api/generate` call needs, bundled so the helper takes a single argument. */
interface OllamaGenerateRequest {
  imagePaths: string[];
  prompt: string;
  temperature: number;
  /** Per-request timeout in milliseconds; omitted means the axios instance default applies. */
  timeoutMs?: number;
  errorLabel: string;
  fallbackDescription: string;
}

/**
 * Ollama Vision Provider Implementation
 * See: https://github.com/ollama/ollama/blob/main/docs/api.md
 *
 * Every public method posts a prompt plus base64 images to `/api/generate`
 * with streaming disabled. Failures are logged and converted into a fallback
 * description with zeroed usage instead of being rethrown.
 */
export class OllamaVisionProvider implements VisionProvider {
  private config: VisionProviderConfig;
  private axiosInstance: AxiosInstance;

  /**
   * @param config - Provider settings; `baseUrl` falls back to
   *   `http://localhost:11434` when it is not set
   */
  constructor(config: VisionProviderConfig) {
    this.config = config;
    this.axiosInstance = axios.create({
      baseURL: config.baseUrl || "http://localhost:11434",
      headers: { "Content-Type": "application/json" }
    });
  }

  /**
   * Describe a single image
   * @param imagePath - Path to the image file
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
    return this.generate({
      imagePaths: [imagePath],
      prompt,
      temperature: 0.1,
      errorLabel: "Ollama describeImage error:",
      fallbackDescription: "Unable to describe this image."
    });
  }

  /**
   * Compare two images and describe differences
   * @param image1Path - Path to the first image
   * @param image2Path - Path to the second image
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
    return this.generate({
      imagePaths: [image1Path, image2Path],
      prompt,
      temperature: 0.2,
      errorLabel: "Ollama compareImages error:",
      fallbackDescription: "Unable to describe the differences."
    });
  }

  /**
   * Describe a batch of images
   * @param imagePaths - Array of paths to the images
   * @param lastBatchContext - Context from the previous batch (optional); when
   *   it carries a `lastDescription`, that text is prepended to the prompt to
   *   help maintain a storyline across batches
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async describeBatch(
    imagePaths: string[],
    lastBatchContext: BatchContext,
    prompt: string
  ): Promise<VisionResult> {
    const userPrompt = lastBatchContext?.lastDescription
      ? `Previous batch summary: ${lastBatchContext.lastDescription}\n\n${prompt}`
      : prompt;

    return this.generate({
      imagePaths,
      prompt: userPrompt,
      temperature: 0.2,
      timeoutMs: 120000, // two minutes; only the batch call sets a timeout
      errorLabel: "Ollama describeBatch error:",
      fallbackDescription: "Unable to describe this batch of images."
    });
  }

  /**
   * Shared request path: encodes the images, posts to `/api/generate`, and
   * converts the reply into a `VisionResult`.
   *
   * The response-length cap is sent as `num_predict`, the native Ollama
   * option name; `max_tokens` is not an option of this endpoint. Usage is
   * taken from `prompt_eval_count` / `eval_count` and reported as 0 for any
   * counter the server leaves out.
   *
   * @param request - Prompt, images, sampling temperature, optional timeout,
   *   and the log label / fallback text to use on failure
   * @returns Description and usage stats; never rejects
   */
  private async generate(request: OllamaGenerateRequest): Promise<VisionResult> {
    try {
      const images = request.imagePaths.map(imagePath =>
        fs.readFileSync(imagePath).toString('base64')
      );

      const httpResponse = await this.axiosInstance.post(
        '/api/generate',
        {
          model: this.config.model,
          prompt: request.prompt,
          images,
          stream: false,
          options: {
            num_predict: this.config.maxTokens || 300,
            temperature: request.temperature
          }
        },
        request.timeoutMs === undefined ? undefined : { timeout: request.timeoutMs }
      );

      const data: OllamaGenerateResponse = httpResponse.data;
      const inputTokens = data.prompt_eval_count ?? 0;
      const outputTokens = data.eval_count ?? 0;

      return {
        description: (data.response || "").trim(),
        usage: {
          inputTokens,
          outputTokens,
          totalTokens: inputTokens + outputTokens
        }
      };
    } catch (error) {
      console.error(request.errorLabel, error);
      return {
        description: request.fallbackDescription,
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }
}
|
||||
193
src/providers/vision/openAIVisionProvider.ts
Normal file
193
src/providers/vision/openAIVisionProvider.ts
Normal file
@@ -0,0 +1,193 @@
|
||||
import fs from 'fs';
|
||||
import { OpenAI } from 'openai';
|
||||
import { VisionProvider, VisionProviderConfig, VisionResult, BatchContext } from '../../interfaces';
|
||||
|
||||
/** One element of a user message's content array, limited to the two kinds this provider sends. */
type OpenAIContentPart =
  | { type: "text"; text: string }
  | { type: "image_url"; image_url: { url: string } };

/** The two message shapes this provider sends to the chat completions endpoint. */
type OpenAIChatMessage =
  | { role: "system"; content: string }
  | { role: "user"; content: OpenAIContentPart[] };

/** Everything one chat-completion call needs, bundled so the helper takes a single argument. */
interface OpenAIVisionRequest {
  imagePaths: string[];
  prompt: string;
  /** Sent as a leading system message when non-empty. */
  systemContext?: string;
  /** Left out of the request entirely when undefined. */
  temperature?: number;
  errorLabel: string;
  fallbackDescription: string;
}

/**
 * OpenAI Vision Provider Implementation
 *
 * Every public method sends one user message holding the prompt text followed
 * by the images as base64 data URLs. Failures are logged and converted into a
 * fallback description with zeroed usage instead of being rethrown.
 */
export class OpenAIVisionProvider implements VisionProvider {
  private config: VisionProviderConfig;
  private openai: OpenAI;

  /**
   * @param config - Provider settings; `apiKey` is handed to the OpenAI client
   */
  constructor(config: VisionProviderConfig) {
    this.config = config;
    this.openai = new OpenAI({
      apiKey: config.apiKey,
    });
  }

  /**
   * Describe a single image
   * @param imagePath - Path to the image file
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async describeImage(imagePath: string, prompt: string): Promise<VisionResult> {
    return this.complete({
      imagePaths: [imagePath],
      prompt,
      temperature: 0.1, // only the single-image call pins a temperature
      errorLabel: "Error describing image:",
      fallbackDescription: "Unable to describe this image."
    });
  }

  /**
   * Compare two images and describe the differences
   * @param image1Path - Path to the first image
   * @param image2Path - Path to the second image
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async compareImages(image1Path: string, image2Path: string, prompt: string): Promise<VisionResult> {
    return this.complete({
      imagePaths: [image1Path, image2Path],
      prompt,
      errorLabel: "Error comparing images:",
      fallbackDescription: "Unable to describe the differences between these images."
    });
  }

  /**
   * Describe a batch of images
   * @param imagePaths - Array of paths to the images
   * @param lastBatchContext - Context from the previous batch; when it carries
   *   a `lastDescription`, that text is sent as a system message ahead of the
   *   user message
   * @param prompt - Prompt for the AI
   * @returns Description and usage stats
   */
  async describeBatch(
    imagePaths: string[],
    lastBatchContext: BatchContext,
    prompt: string
  ): Promise<VisionResult> {
    const systemContext = lastBatchContext?.lastDescription
      ? `Previous batch summary: ${lastBatchContext.lastDescription}`
      : undefined;

    return this.complete({
      imagePaths,
      prompt,
      systemContext,
      errorLabel: "Error describing batch of images:",
      fallbackDescription: "Unable to describe this batch of images."
    });
  }

  /**
   * Shared request path: encodes the images, builds the message list, calls
   * the chat completions endpoint, and converts the reply into a
   * `VisionResult`.
   *
   * If the reply has no first choice or its content is empty, the description
   * is "No description generated." and the usage reported by the API is still
   * returned.
   *
   * @param request - Prompt, images, optional system context and temperature,
   *   and the log label / fallback text to use on failure
   * @returns Description and usage stats; never rejects
   */
  private async complete(request: OpenAIVisionRequest): Promise<VisionResult> {
    try {
      const imageParts = request.imagePaths.map((imagePath): OpenAIContentPart => {
        const base64 = fs.readFileSync(imagePath).toString('base64');
        const mimeType = OpenAIVisionProvider.mimeTypeFor(imagePath);
        return {
          type: "image_url",
          image_url: { url: `data:${mimeType};base64,${base64}` }
        };
      });

      const messages: OpenAIChatMessage[] = [];
      if (request.systemContext) {
        messages.push({ role: "system", content: request.systemContext });
      }
      messages.push({
        role: "user",
        content: [{ type: "text", text: request.prompt }, ...imageParts]
      });

      const response = await this.openai.chat.completions.create({
        model: this.config.model,
        messages,
        max_tokens: this.config.maxTokens || 300,
        // Spread so that no `temperature` key is sent unless the caller set one.
        ...(request.temperature === undefined ? {} : { temperature: request.temperature })
      });

      return {
        description: response.choices[0]?.message.content?.trim() || "No description generated.",
        usage: {
          inputTokens: response.usage?.prompt_tokens ?? 0,
          outputTokens: response.usage?.completion_tokens ?? 0,
          totalTokens: response.usage?.total_tokens ?? 0
        }
      };
    } catch (error) {
      console.error(request.errorLabel, error);
      return {
        description: request.fallbackDescription,
        usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
      };
    }
  }

  /**
   * Picks the MIME type for the data URL from the file extension
   * (case-insensitive). Anything unrecognised is labelled `image/jpeg`, which
   * is what every file was labelled before extension detection was added.
   */
  private static mimeTypeFor(imagePath: string): string {
    const dot = imagePath.lastIndexOf('.');
    const extension = dot === -1 ? '' : imagePath.slice(dot + 1).toLowerCase();

    switch (extension) {
      case 'png':
        return 'image/png';
      case 'gif':
        return 'image/gif';
      case 'webp':
        return 'image/webp';
      default:
        return 'image/jpeg';
    }
  }
}
|
||||
31
src/providers/vision/visionProviderFactory.ts
Normal file
31
src/providers/vision/visionProviderFactory.ts
Normal file
@@ -0,0 +1,31 @@
|
||||
import { VisionProvider } from '../../interfaces';
|
||||
import { Config } from '../../config/config';
|
||||
import { OpenAIVisionProvider } from './openAIVisionProvider';
|
||||
import { GeminiVisionProvider } from './geminiVisionProvider';
|
||||
import { OllamaVisionProvider } from './ollamaVisionProvider';
|
||||
|
||||
/**
 * Creates the vision AI provider selected by the application configuration.
 */
export class VisionProviderFactory {
  /**
   * Looks up the settings stored under `config.visionProvider` in
   * `config.visionProviders` and constructs the matching provider.
   *
   * @param config - Application configuration
   * @returns A provider instance for the selected name
   * @throws Error when no settings exist for the selected name, or when the
   *   name has settings but no implementation here
   */
  static getProvider(config: Config): VisionProvider {
    const selected = config.visionProvider;
    const settings = config.visionProviders[selected];

    if (settings) {
      if (selected === 'openai') {
        return new OpenAIVisionProvider(settings);
      }
      if (selected === 'gemini') {
        return new GeminiVisionProvider(settings);
      }
      if (selected === "ollama") {
        return new OllamaVisionProvider(settings);
      }
      // Add other providers here

      throw new Error(`Vision provider "${selected}" not implemented.`);
    }

    throw new Error(`Vision provider "${selected}" not configured.`);
  }
}
|
||||
Reference in New Issue
Block a user