In this notebook, we show how to use the MistralAI MultiModal LLM class/abstraction for image understanding/reasoning.
We demonstrate the following functions supported by the MistralAI Pixtral multimodal LLM:
complete (both sync and async): for a single prompt and a list of images
stream_complete (both sync and async): for streaming output of complete
%pip install llama-index-multi-modal-llms-mistralai
%pip install matplotlib
import os
from IPython.display import Markdown, display
os.environ[
"MISTRAL_API_KEY"
] = "<YOUR API KEY>" # Your MistralAI API token here
Initialize MistralAIMultiModal
from llama_index.multi_modal_llms.mistralai import MistralAIMultiModal
mistralai_mm_llm = MistralAIMultiModal(
model="pixtral-12b-2409", max_new_tokens=300
)
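The examples above read the API key from the MISTRAL_API_KEY environment variable. You can also configure the client explicitly; the api_key and temperature parameters below are assumptions that mirror other llama-index LLM wrappers, so treat this as a sketch rather than the canonical signature.
# Alternative setup (sketch): pass the key and sampling settings explicitly.
# NOTE: api_key and temperature are assumed parameters here, mirroring other
# llama-index LLM wrappers; keep the environment-variable approach if unsure.
mistralai_mm_llm = MistralAIMultiModal(
    model="pixtral-12b-2409",
    max_new_tokens=300,
    temperature=0.7,
    api_key=os.environ["MISTRAL_API_KEY"],
)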
Load Images from URLs
from llama_index.core.multi_modal_llms.generic_utils import load_image_urls
image_urls = [
"https://tripfixers.com/wp-content/uploads/2019/11/eiffel-tower-with-snow.jpeg",
"https://cdn.statcdn.com/Infographic/images/normal/30322.jpeg",
]
image_documents = load_image_urls(image_urls)
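As a quick sanity check, you can look at what load_image_urls produced. This sketch assumes it returns ImageDocument objects that expose an image_url field.
# Inspect the loaded documents (assumes ImageDocument objects with an image_url field)
for doc in image_documents:
    print(type(doc).__name__, doc.image_url)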
First Image
from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
img_response = requests.get(image_urls[0], headers=headers)
print(image_urls[0])
img = Image.open(BytesIO(img_response.content))
plt.imshow(img)
Second Image
img_response = requests.get(image_urls[1], headers=headers)
print(image_urls[1])
img = Image.open(BytesIO(img_response.content))
plt.imshow(img)
Complete a prompt with a bunch of images
complete_response = mistralai_mm_llm.complete(
prompt="Describe the images as an alternative text in a few words",
image_documents=image_documents,
)
display(Markdown(f"{complete_response}"))
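complete returns a standard llama-index CompletionResponse, so besides rendering it as Markdown you can work with the raw string via its text attribute.
# Access the raw completion string (CompletionResponse.text)
print(complete_response.text)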
Stream Complete a prompt with a bunch of images
stream_complete_response = mistralai_mm_llm.stream_complete(
prompt="give me more context for this images in a few words",
image_documents=image_documents,
)
for r in stream_complete_response:
print(r.delta, end="")
Async Complete
response_acomplete = await mistralai_mm_llm.acomplete(
prompt="Describe the images as an alternative text in a few words",
image_documents=image_documents,
)
display(Markdown(f"{response_acomplete}"))
Async Stream Complete
response_astream_complete = await mistralai_mm_llm.astream_complete(
prompt="Describe the images as an alternative text in a few words",
image_documents=image_documents,
)
async for delta in response_astream_complete:
print(delta.delta, end="")
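Note that the await calls above rely on the event loop that Jupyter already runs. In a plain Python script, you would wrap the call in a coroutine and drive it with asyncio.run, for example:
import asyncio


async def main():
    response = await mistralai_mm_llm.acomplete(
        prompt="Describe the images as an alternative text in a few words",
        image_documents=image_documents,
    )
    print(response)


# asyncio.run(main())  # use this in a script; notebooks already have a running loop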
Complete with Two Images
image_urls = [
"https://tripfixers.com/wp-content/uploads/2019/11/eiffel-tower-with-snow.jpeg",
"https://assets.visitorscoverage.com/production/wp-content/uploads/2024/04/AdobeStock_626542468-min-1024x683.jpeg",
]
Let's inspect the images.
First Image
img_response = requests.get(image_urls[0], headers=headers)
print(image_urls[0])
img = Image.open(BytesIO(img_response.content))
plt.imshow(img)
Second Image
img_response = requests.get(image_urls[1], headers=headers)
print(image_urls[1])
img = Image.open(BytesIO(img_response.content))
plt.imshow(img)
image_documents_compare = load_image_urls(image_urls)
response_multi = mistralai_mm_llm.complete(
prompt="What are the differences between two images?",
image_documents=image_documents_compare,
)
display(Markdown(f"{response_multi}"))
Load Images from local files
!wget 'https://www.boredpanda.com/blog/wp-content/uploads/2022/11/interesting-receipts-102-6364c8d181c6a__700.jpg' -O 'receipt.jpg'
from PIL import Image
import matplotlib.pyplot as plt
img = Image.open("./receipt.jpg")
plt.imshow(img)
from llama_index.core import SimpleDirectoryReader
# point this at your local image file(s)
image_documents = SimpleDirectoryReader(
input_files=["./receipt.jpg"]
).load_data()
response = mistralai_mm_llm.complete(
prompt="Transcribe the text in the image",
image_documents=image_documents,
)
display(Markdown(f"{response}"))