llama vision | image 2 image | | Search

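The code imports various modules and functions, then defines an asynchronous function whiskImages that takes four arguments (subject, scene, style, and short). Each of the first three may be given as a data URI, a URL, a file path, a plain-text description, or a non-string object such as a Buffer, and is handled accordingly; short selects summarized prompts.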

Run example

npm run import -- "whisk images"

whisk images

const fs = require('fs')
const { request } = require('gaxios')
const requestOllamaVision = importer.import("request ollama vision")
const selectModel = importer.import("select llm")
const {doStableRequest} = importer.import("stable diffusion request")
const {doImage2Image} = importer.import("image 2 image")
const {doBackgroundMask} = importer.import("mask image")
const {doInpaintMask} = importer.import("inpaint mask")
// TODO: use the above functions in combination to whisk together a set of images

// Prefix used everywhere an image is handed to a helper as a data URI.
const WHISK_DATA_PREFIX = 'data:image/png;base64,'

/**
 * Resolve one whisk input (subject, scene or style) into image data or text.
 *
 * Accepted forms, checked in this order:
 *   - falsy value: nothing supplied, returns an empty object
 *   - non-string: encoded with toString('base64') (e.g. a Buffer)
 *   - string starting with 'data:image/': the header up to the first comma is
 *     stripped and the payload is normalised through Buffer
 *   - string containing '://': fetched with a GET request
 *   - any other string: read as a file if it exists, otherwise kept as text
 *
 * @param {*} input - the raw argument given to whiskImages
 * @returns {Promise<{base64?: string, text?: string}>} image data or description
 */
async function resolveWhiskInput(input) {
  if (!input) {
    return {}
  }
  if (typeof input !== 'string') {
    return { base64: input.toString('base64') }
  }
  if (input.startsWith('data:image/')) {
    const payload = input.replace(/^data:image\/[^,]*,/, '')
    return { base64: Buffer.from(payload, 'base64').toString('base64') }
  }
  if (input.includes('://')) {
    const result = await request({
      url: input,
      method: 'GET',
    })
    // assumes result.data exposes arrayBuffer() (e.g. a Blob) — TODO confirm for the gaxios version in use
    return { base64: Buffer.from(await result.data.arrayBuffer()).toString('base64') }
  }
  if (!fs.existsSync(input)) {
    // Not a file on disk, so the string itself is the description.
    return { text: input }
  }
  return { base64: fs.readFileSync(input).toString('base64') }
}

/**
 * Produce the description of one resolved input and the text to use in prompts.
 *
 * When only image data is available the vision helper is asked `question`
 * about it. When `short` is truthy the description is additionally passed to
 * `promptModel` with a summarisation instruction.
 *
 * @param {{base64?: string, text?: string}} part - result of resolveWhiskInput
 * @param {string} question - question sent along with the image
 * @param {*} short - truthy to use the summarised description in prompts
 * @param {Function} promptModel - function returned by selectModel
 * @returns {Promise<{text: *, prompt: *}>} full description and prompt text
 */
async function describeWhiskInput(part, question, short, promptModel) {
  let text = part.text
  if (!text && part.base64) {
    text = await requestOllamaVision(WHISK_DATA_PREFIX + part.base64, question)
  }
  let prompt = text
  if (short && text) {
    prompt = await promptModel('Summarize this sentence into four or five words:\n' + text + '\nOnly return the summary, no title or explanation.')
  }
  return { text, prompt }
}

/**
 * Join the prompt fragments that are actually present with newlines, so a
 * missing part never shows up as the literal text "undefined".
 *
 * @param {...*} parts - prompt fragments, possibly undefined or empty
 * @returns {string} the non-empty fragments joined by '\n'
 */
function joinWhiskPrompt(...parts) {
  return parts.filter(Boolean).join('\n')
}

/**
 * Combine a subject, a scene and a style into one generated image.
 *
 * Routing, in order:
 *   1. no images at all: the descriptions go to doStableRequest
 *   2. subject image without a scene: doImage2Image on the subject
 *   3. scene image without a subject: doImage2Image on the scene
 *   4. subject image with a scene: doBackgroundMask, then doInpaintMask with
 *      the scene prompt, then (only if a style is given) doImage2Image
 *   5. anything else: logs 'Missing components: ' and returns {}
 *
 * @param {*} subject - image or description of the foreground subject
 * @param {*} scene - image or description of the scenery
 * @param {*} style - image or description of the art style
 * @param {*} short - truthy to prompt with four-or-five-word summaries
 * @returns {Promise<*>} whatever the selected helper returns, or {} when no
 *   supported combination of inputs was supplied
 */
async function whiskImages(subject, scene, style, short) {
  const promptModel = await selectModel(process.env.DEFAULT_MODEL || 'Default')

  // Inputs are resolved and described one after another, in the original order.
  const subjectInput = await resolveWhiskInput(subject)
  const sceneInput = await resolveWhiskInput(scene)
  const styleInput = await resolveWhiskInput(style)

  const subjectPart = await describeWhiskInput(subjectInput,
    'Describe the foreground subject of the image in one short sentence.', short, promptModel)
  const scenePart = await describeWhiskInput(sceneInput,
    'Describe the scenery in the image in one short sentence.', short, promptModel)
  const stylePart = await describeWhiskInput(styleInput,
    'Describe the art style of image in one short sentence.', short, promptModel)

  if (!sceneInput.base64 && !styleInput.base64 && !subjectInput.base64) {
    // no images passed in, send directly to image generator
    const prompt = joinWhiskPrompt(subjectPart.prompt, scenePart.prompt, stylePart.prompt)
    if (!prompt) {
      // Nothing usable was supplied; do not send an empty prompt.
      console.error('Missing components: ')
      return {}
    }
    return await doStableRequest(prompt)
  }

  if (subjectInput.base64 && !scenePart.text) {
    // subject and style process only, pass directly to image 2 image
    return await doImage2Image(
      WHISK_DATA_PREFIX + subjectInput.base64,
      joinWhiskPrompt(subjectPart.prompt, stylePart.prompt))
  }

  if (sceneInput.base64 && !subjectPart.text) {
    // scene and style only, pass to image 2 image
    return await doImage2Image(
      WHISK_DATA_PREFIX + sceneInput.base64,
      joinWhiskPrompt(scenePart.prompt, stylePart.prompt))
  }

  if (subjectInput.base64 && scenePart.text) {
    // extract a mask from the subject image
    const maskObject = await doBackgroundMask(WHISK_DATA_PREFIX + subjectInput.base64)
    const base64Mask = maskObject.image.toString('base64')

    // combine subject with new scene
    const inpaintObject = await doInpaintMask(
      WHISK_DATA_PREFIX + subjectInput.base64,
      WHISK_DATA_PREFIX + base64Mask,
      scenePart.prompt)

    // Drop out early if there is no style specified, just do the proper inpainting
    if (!stylePart.text) {
      return inpaintObject
    }

    // generate final image in new style
    const base64Inpaint = inpaintObject.image.toString('base64')
    return await doImage2Image(
      WHISK_DATA_PREFIX + base64Inpaint,
      joinWhiskPrompt(stylePart.prompt, subjectPart.prompt, scenePart.prompt))
  }

  console.error('Missing components: ')
  return {}
}

module.exports = whiskImages

What the code could have been:

const { request, getAuth } = require('gaxios');
const importer = require('importer');
const { doStableRequest, doImage2Image, doBackgroundMask, doInpaintMask } = importer.import([
 'stable diffusion request',
  'image 2 image',
 'mask image',
  'inpaint mask'
]);

/**
 * Load an image reference and return its contents as a base64 string.
 *
 * Accepted forms, checked in this order:
 *   - Buffer: encoded directly
 *   - string starting with 'data:image/': the header up to the first comma is
 *     stripped and the payload is normalised through Buffer
 *   - string containing '://': fetched with a GET request
 *   - any other string: read from disk as a file path
 *
 * @param {string|Buffer} imagePathOrUrl - data URI, URL, file path or Buffer
 * @returns {Promise<string>} the image bytes encoded as base64
 * @throws {TypeError} when the argument is neither a string nor a Buffer
 * @throws {Error} "File <path> does not exist." when the path is not on disk
 */
async function getImageAsBase64(imagePathOrUrl) {
  if (Buffer.isBuffer(imagePathOrUrl)) {
    return imagePathOrUrl.toString('base64');
  }
  if (typeof imagePathOrUrl !== 'string') {
    throw new TypeError('getImageAsBase64 expects a data URI, URL, file path or Buffer.');
  }

  if (imagePathOrUrl.startsWith('data:image/')) {
    const payload = imagePathOrUrl.replace(/^data:image\/[^,]*,/, '');
    return Buffer.from(payload, 'base64').toString('base64');
  }

  if (imagePathOrUrl.includes('://')) {
    const response = await request({
      url: imagePathOrUrl,
      method: 'GET',
    });
    // assumes response.data exposes arrayBuffer() (e.g. a Blob) — TODO confirm for the gaxios version in use
    return Buffer.from(await response.data.arrayBuffer()).toString('base64');
  }

  const fs = require('fs');
  if (!fs.existsSync(imagePathOrUrl)) {
    // Callers match on this message, so keep its wording stable.
    throw new Error(`File ${imagePathOrUrl} does not exist.`);
  }
  return fs.readFileSync(imagePathOrUrl).toString('base64');
}

/**
 * Ask the Ollama Vision helper a question about a base64-encoded image.
 *
 * @param {string} imageBase64 - base64 image data without a data-URI header
 * @param {string} descriptionPrompt - question passed along with the image
 * @returns {Promise<*>} whatever requestOllamaVision resolves to
 */
async function getImageDescription(imageBase64, descriptionPrompt) {
  // The helper is given the image as a PNG data URI.
  const imageDataUri = 'data:image/png;base64,' + imageBase64;
  return await requestOllamaVision(imageDataUri, descriptionPrompt);
}

/**
 * Ask a language model to shorten a sentence to four or five words.
 *
 * @param {string} sentence - text to be summarised
 * @param {Function} model - called with the full instruction; may return a
 *   value or a promise
 * @returns {Promise<*>} whatever the model resolves to
 */
async function summarizeSentence(sentence, model) {
  const instruction = `Summarize this sentence into four or five words:\n${sentence}\nOnly return the summary, no title or explanation.`;
  return await model(instruction);
}

/**
 * Combine a subject, a scene and a style into one generated image.
 *
 * Each input may be an image reference understood by getImageAsBase64, a
 * non-string object such as a Buffer, or a plain-text description. Images are
 * described with getImageDescription; when `short` is truthy every
 * description is shortened with summarizeSentence before it is used in a
 * prompt.
 *
 * Routing, in order:
 *   1. no images at all: the descriptions go to doStableRequest
 *   2. subject image without a scene: doImage2Image on the subject
 *   3. scene image without a subject: doImage2Image on the scene
 *   4. subject image with a scene: doBackgroundMask, then doInpaintMask with
 *      the scene prompt, then (only if a style is given) doImage2Image
 *   5. anything else: logs 'Missing components:' and returns {}
 *
 * @param {*} subject - image or description of the foreground subject
 * @param {*} scene - image or description of the scenery
 * @param {*} style - image or description of the art style
 * @param {*} short - truthy to prompt with four-or-five-word summaries
 * @returns {Promise<*>} whatever the selected helper returns, or {} when no
 *   supported combination of inputs was supplied
 */
async function whiskImages(subject, scene, style, short) {
  const DATA_URI_PREFIX = 'data:image/png;base64,';

  // Get the model to use for summarization
  const model = await selectModel(process.env.DEFAULT_MODEL || 'Default');

  // Turn one raw input into { base64, description, prompt }.
  const loadPart = async (input, descriptionPrompt) => {
    const part = { base64: undefined, description: undefined, prompt: undefined };
    if (!input) {
      return part;
    }

    if (typeof input !== 'string') {
      part.base64 = input.toString('base64');
    } else {
      try {
        part.base64 = await getImageAsBase64(input);
      } catch (error) {
        // getImageAsBase64 reports a non-file string with this message; such a
        // string is a text description, not a failure. Anything else is real.
        if (!(error instanceof Error) || !error.message.includes('does not exist')) {
          throw error;
        }
        part.description = input;
      }
    }

    if (part.base64) {
      part.description = await getImageDescription(part.base64, descriptionPrompt);
    }
    part.prompt = short && part.description
      ? await summarizeSentence(part.description, model)
      : part.description;
    return part;
  };

  // Join only the fragments that exist so "undefined" never reaches a prompt.
  const joinPrompt = (...fragments) => fragments.filter(Boolean).join('\n');

  const subjectPart = await loadPart(subject, 'Describe the foreground subject of the image in one short sentence.');
  const scenePart = await loadPart(scene, 'Describe the scenery in the image in one short sentence.');
  const stylePart = await loadPart(style, 'Describe the art style of image in one short sentence.');

  // Determine which image generation route to take
  if (!scenePart.base64 && !stylePart.base64 && !subjectPart.base64) {
    // No images provided, generate an image directly
    const prompt = joinPrompt(subjectPart.prompt, scenePart.prompt, stylePart.prompt);
    if (!prompt) {
      console.error('Missing components:');
      return {};
    }
    return await doStableRequest(prompt);
  }

  if (subjectPart.base64 && !scenePart.description) {
    // Only subject and style, generate an image 2 image
    return await doImage2Image(
      DATA_URI_PREFIX + subjectPart.base64,
      joinPrompt(subjectPart.prompt, stylePart.prompt));
  }

  if (scenePart.base64 && !subjectPart.description) {
    // Only scene and style, generate an image 2 image
    return await doImage2Image(
      DATA_URI_PREFIX + scenePart.base64,
      joinPrompt(scenePart.prompt, stylePart.prompt));
  }

  if (subjectPart.base64 && scenePart.description) {
    // Combine subject and scene to generate a new image
    const mask = await doBackgroundMask(DATA_URI_PREFIX + subjectPart.base64);
    const inpaintImage = await doInpaintMask(
      DATA_URI_PREFIX + subjectPart.base64,
      DATA_URI_PREFIX + mask.image.toString('base64'),
      scenePart.prompt);
    if (!stylePart.description) {
      return inpaintImage;
    }
    return await doImage2Image(
      DATA_URI_PREFIX + inpaintImage.image.toString('base64'),
      joinPrompt(stylePart.prompt, subjectPart.prompt, scenePart.prompt));
  }

  console.error('Missing components:');
  return {};
}

module.exports = whiskImages;

Code Breakdown

Importing Modules and Functions

The code starts by importing various modules and functions:

const fs = require('fs')
const { request } = require('gaxios')
const requestOllamaVision = importer.import('request ollama vision')
const selectModel = importer.import('select llm')
const {doStableRequest} = importer.import('stable diffusion request')
const {doImage2Image} = importer.import('image 2 image')
const {doBackgroundMask} = importer.import('mask image')
const {doInpaintMask} = importer.import('inpaint mask')

Function Definition: whiskImages

The whiskImages function takes four arguments: subject, scene, style, and short, and is defined as an asynchronous function:

async function whiskImages(subject, scene, style, short) {
  //...
}

Handling subject Input

The function handles the subject input in several cases:

  1. If subject is a string that starts with 'data:image/', it extracts the base64-encoded image data.
  2. If subject is a string that includes '://', it makes a GET request to the URL and extracts the base64-encoded image data.
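  3. If subject is any other string, it is treated as a possible file path: if no such file exists the string is kept as a text description; if the file exists, it is read and base64-encoded.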
  4. If subject is not a string, it converts it to a base64-encoded string.

Handling scene Input

The function handles the scene and style inputs in the same way as the subject input.

Error Handling

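The function contains no try/catch error handling, so a failed download, file read, or model call rejects the returned promise. The only logging is a console.error('Missing components: ') call, issued when no supported combination of inputs is found; in that case an empty object is returned.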

Unused Variables

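The variables sceneShort, sceneString, style, and short are all used: style is processed like subject and scene, short selects the summarized prompts, sceneString drives the choice of generation route, and sceneShort is passed to the image helpers when short is set. No declared variable in the function is left unused.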

Documentation

The code contains only a handful of inline TODO comments and no function-level documentation, making it difficult to understand its purpose or functionality.