ollama vision request

Sends an image (required) and an optional prompt to a locally running Ollama chat API using the llama3.2-vision model, and returns the content of the response message. The function uses async/await syntax and assumes the API is listening on http://localhost:11434/api/chat.

const { request } = require('gaxios');
const fs = require('fs');

/**
 * Sends an image and an optional prompt to the local Ollama chat endpoint
 * (model "llama3.2-vision") and returns the text of the reply.
 *
 * @param {Buffer|string} image - One of: a Buffer of image bytes, a
 *   `data:image/...;base64,` URI, a URL containing `://` (downloaded first),
 *   a path to an existing local file, or a raw base64 string.
 * @param {string} [prompt] - Instruction for the model. Any falsy value falls
 *   back to "Describe the image in great detail."
 * @returns {Promise<string|undefined>} The `message.content` field of the API
 *   response, or `undefined` when no image was given or the response carries
 *   no message.
 * @throws Propagates any error raised by gaxios (download or API request) or
 *   by `fs.readFileSync`.
 */
async function requestOllamaVision(image, prompt) {
  if (!image) {
    console.error('image not set!');
    return;
  }

  let base64Image;
  if (typeof image === 'string') {
    // Drop a data-URI header so only the base64 payload remains. A local copy
    // is used so the caller-visible parameter is never reassigned.
    const source = image.startsWith('data:image/')
      ? image.replace(/^data:image\/.*?;base64,/gi, '')
      : image;

    if (source.includes('://')) {
      // Ask gaxios for raw bytes explicitly; its default response handling
      // may decode the body as text/JSON, which corrupts binary image data.
      const download = await request({
        url: source,
        method: 'GET',
        responseType: 'arraybuffer',
      });
      base64Image = Buffer.from(download.data).toString('base64');
    } else if (fs.existsSync(source)) {
      base64Image = fs.readFileSync(source).toString('base64');
    } else {
      // Not a URL and not an existing file: treat it as base64 already.
      // The decode/encode round trip normalizes the string.
      base64Image = Buffer.from(source, 'base64').toString('base64');
    }
  } else {
    base64Image = image.toString('base64');
  }

  const result = await request({
    url: 'http://localhost:11434/api/chat',
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    data: JSON.stringify({
      model: 'llama3.2-vision',
      stream: false,
      messages: [
        {
          role: 'user',
          content: prompt ? prompt : 'Describe the image in great detail.',
          images: [base64Image],
        },
      ],
    }),
  });

  if (result.data && result.data.message) {
    return result.data.message.content;
  }
  return undefined;
}

module.exports = requestOllamaVision;

What the code could have been:

const { google } = require('googleapis');
const fs = require('fs');
const fetch = require('isomorphic-fetch');

/**
 * Sends an image and a prompt to the local Ollama chat endpoint
 * (http://localhost:11434/api/chat, model "llama3.2-vision") and returns the
 * text of the reply.
 *
 * @param {Buffer|string} image - One of: a Buffer of image bytes, a
 *   `data:image/...;base64,` URI, a URL containing `://` (downloaded first),
 *   a path to an existing local file, or a raw base64 string.
 * @param {string} [prompt] - Instruction for the model. A missing, null or
 *   empty prompt falls back to "Describe the image in great detail."
 * @returns {Promise<string|undefined>} The `message.content` field of the API
 *   response. Resolves to `undefined` (after logging via console.error) when
 *   the image is missing, cannot be loaded, or the API request fails.
 */
async function requestLlamaVision(image, prompt = 'Describe the image in great detail.') {
  if (!image) {
    console.error('Image not set!');
    return;
  }

  let base64Image;
  try {
    if (typeof image === 'string') {
      if (image.startsWith('data:image/')) {
        // Keep only the base64 payload of the data URI.
        base64Image = image.replace(/^data:image\/.*?;base64,/i, '');
      } else if (image.includes('://')) {
        const download = await fetch(image);
        if (!download.ok) {
          throw new Error(`Image download failed with HTTP status ${download.status}`);
        }
        base64Image = Buffer.from(await download.arrayBuffer()).toString('base64');
      } else if (fs.existsSync(image)) {
        base64Image = fs.readFileSync(image).toString('base64');
      } else {
        // Not a URL and not an existing file: treat it as base64 already.
        // The decode/encode round trip normalizes the string.
        base64Image = Buffer.from(image, 'base64').toString('base64');
      }
    } else {
      base64Image = image.toString('base64');
    }
  } catch (error) {
    console.error('Error processing image:', error);
    return;
  }

  try {
    const apiResponse = await fetch('http://localhost:11434/api/chat', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model: 'llama3.2-vision',
        stream: false,
        messages: [
          {
            role: 'user',
            // The parameter default only covers `undefined`; also fall back
            // for null or an empty string.
            content: prompt || 'Describe the image in great detail.',
            images: [base64Image],
          },
        ],
      }),
    });
    if (!apiResponse.ok) {
      throw new Error(`Chat request failed with HTTP status ${apiResponse.status}`);
    }

    const payload = await apiResponse.json();
    return payload && payload.message ? payload.message.content : undefined;
  } catch (error) {
    console.error('Error requesting LLaMA Vision API:', error);
    return;
  }
}

module.exports = requestLlamaVision;

Function: requestOllamaVision

Description

Sends a required image and an optional prompt to the LLaMA Vision API (the Ollama chat endpoint with the llama3.2-vision model) and returns the model's reply.

Parameters

image: The image to be processed. Can be a:

Buffer object
string representing a base64 encoded image
string representing a URL to an image (will be downloaded and processed)
string representing a local file path to an image (will be read and processed)

prompt: The prompt to be sent to the LLaMA Vision API. Defaults to "Describe the image in great detail."

Returns

string: The response message from the LLaMA Vision API.

Throws

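Error: If the image cannot be downloaded or read, or if the API request fails (the error propagates from gaxios or fs). A missing image does not throw: the function logs "image not set!" and returns undefined.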

Dependencies

gaxios for making HTTP requests

fs for reading local files

Notes

The function uses the async/await syntax to handle promises.

The request function from gaxios is used to make HTTP requests.

The function assumes that the LLaMA Vision API is running on http://localhost:11434/api/chat.

The function returns the content of the response message from the LLaMA Vision API, or undefined if the response contains no message.

Run example

ollama vision request

What the code could have been:

Function: `requestOllamaVision`

Description

Parameters

Returns

Throws

Dependencies

Notes

Run example

ollama vision request

What the code could have been:

Function: requestOllamaVision

Description

Parameters

Returns

Throws

Dependencies

Notes

Function: `requestOllamaVision`