llama vision | Cell 6 | start a bunch of llm rpc services | Search

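Sends an image (required) and an optional prompt to the LLaMA Vision model and returns the content of the response message. The image may be a Buffer, a URL, a local file path, or a base64 string; when no prompt is given, the model is asked to describe the image in great detail. The function is asynchronous and assumes the Ollama chat API is running at http://localhost:11434/api/chat.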

Run example

npm run import -- "ollama vision request"

ollama vision request

const { request } = require('gaxios')
const fs = require('fs')

/**
 * Ask the llama3.2-vision model served at http://localhost:11434/api/chat
 * to respond to a prompt about a single image.
 *
 * A string `image` is interpreted, in order, as:
 *   1. a base64 data URI (the `data:...;base64,` prefix is stripped),
 *   2. a URL (anything containing `://`), which is downloaded,
 *   3. a path to an existing local file, which is read from disk,
 *   4. otherwise, an already base64-encoded payload.
 * Any non-string value is expected to support `toString('base64')`,
 * e.g. a Buffer.
 *
 * Errors raised by the HTTP client or the file system are not caught here
 * and propagate to the caller as a rejected promise.
 *
 * @param {Buffer|string} image - Image bytes, URL, file path or base64 text.
 * @param {string} [prompt] - Instruction for the model; defaults to
 *   "Describe the image in great detail." when falsy.
 * @returns {Promise<string|undefined>} The content of the response message,
 *   or undefined when no image was given or the response has no message.
 */
async function requestOllamaVision(image, prompt) {
  if (!image) {
    console.error('image not set!')
    return
  }

  let base64_image
  if (typeof image === 'string') {
    // Work on a local copy so the caller-visible parameter is never reassigned.
    let source = image
    if (source.startsWith('data:')) {
      source = source.replace(/^data:[^,]*;base64,/i, '')
    }

    if (source.includes('://')) {
      // Ask for raw bytes explicitly rather than relying on the client
      // guessing the body type from the Content-Type header.
      const download = await request({
        url: source,
        method: 'GET',
        responseType: 'arraybuffer',
      })
      base64_image = Buffer.from(download.data).toString('base64')
    } else if (!fs.existsSync(source)) {
      // Not a file on disk: treat the text as base64 and round-trip it
      // through a Buffer, which drops characters outside the alphabet.
      base64_image = Buffer.from(source, 'base64').toString('base64')
    } else {
      base64_image = fs.readFileSync(source).toString('base64')
    }
  } else {
    base64_image = image.toString('base64')
  }

  const result = await request({
    url: 'http://localhost:11434/api/chat',
    method: 'POST',
    headers: {
      'Content-Type': 'application/json'
    },
    data: JSON.stringify({
      model: 'llama3.2-vision',
      stream: false,
      messages: [
        {
          role: 'user',
          content: prompt ? prompt : 'Describe the image in great detail.',
          images: [base64_image]
        }
      ]
    })
  })

  if (result.data && result.data.message) {
    return result.data.message.content
  }
  return undefined
}

module.exports = requestOllamaVision

What the code could have been:

const { google } = require('googleapis');
const fs = require('fs');
const fetch = require('isomorphic-fetch');

/**
 * Makes a request to the LLaMA Vision model through the chat API at
 * http://localhost:11434/api/chat.
 *
 * A string `image` is interpreted, in order, as a base64 data URI (the
 * `data:...;base64,` prefix is stripped), a URL (anything containing `://`,
 * which is downloaded), a path to an existing local file, or otherwise an
 * already base64-encoded payload. Any non-string value is expected to
 * support `toString('base64')`, e.g. a Buffer.
 *
 * Failures are logged with console.error and reported as `undefined`
 * rather than thrown.
 *
 * @param {Buffer|string} image - The image to be processed.
 * @param {string} [prompt] - The prompt for the image description.
 * @returns {Promise<string|undefined>} The content of the response message,
 *   or undefined when the image is missing, cannot be loaded, or the API
 *   request fails.
 */
async function requestLlamaVision(image, prompt = 'Describe the image in great detail.') {
  // Check if the image is valid
  if (!image) {
    console.error('Image not set!');
    return;
  }

  let base64Image;
  if (typeof image === 'string') {
    try {
      if (image.startsWith('data:')) {
        // Data URI: keep only the base64 payload after the prefix
        base64Image = image.replace(/^data:[^,]*;base64,/i, '');
      } else if (image.includes('://')) {
        // If it's a URL, fetch the image
        const response = await fetch(image);
        if (!response.ok) {
          throw new Error(`Image download failed with status ${response.status}`);
        }
        base64Image = Buffer.from(await response.arrayBuffer()).toString('base64');
      } else if (fs.existsSync(image)) {
        // If it's a local file, read it
        base64Image = fs.readFileSync(image).toString('base64');
      } else {
        // Otherwise treat the text as base64 and normalize it via a Buffer
        base64Image = Buffer.from(image, 'base64').toString('base64');
      }
    } catch (error) {
      // If any of the above steps fail, log and give up
      console.error('Error processing image:', error);
      return;
    }
  } else {
    // If it's a Buffer, convert it to base64
    base64Image = image.toString('base64');
  }

  try {
    // Make the request to the LLaMA Vision API
    const apiResponse = await fetch('http://localhost:11434/api/chat', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model: 'llama3.2-vision',
        stream: false,
        messages: [
          {
            role: 'user',
            content: prompt,
            images: [base64Image],
          },
        ],
      }),
    });
    if (!apiResponse.ok) {
      throw new Error(`Chat request failed with status ${apiResponse.status}`);
    }
    const result = await apiResponse.json();
    return result && result.message ? result.message.content : undefined;
  } catch (error) {
    // If the API request fails, log and give up
    console.error('Error requesting LLaMA Vision API:', error);
    return;
  }
}

module.exports = requestLlamaVision;

Function: requestOllamaVision

Description

Makes a request to the LLaMA Vision API with a required image and an optional prompt, and returns the content of the model's response message.

Parameters

Returns

Throws

Dependencies

Notes