Example usage
Llama 3.2 Vision Instruct uses the same messages format as other Llama models, but with a new image field. Note that while multi-turn conversations are supported, the model can only process one image per generation: the {"type": "image"} placeholder goes in the final user message, and the image itself is passed in a separate top-level image field in the request.
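For example, a multi-turn request might structure its messages like this (a sketch with hypothetical turns; only the latest user message carries the image placeholder, while the image URL itself is passed separately, as in the full example below):

# Sketch of a multi-turn conversation (hypothetical turns).
# Only the latest user message carries the {"type": "image"} placeholder;
# the image itself is sent in the separate top-level "image" field.
messages = [
    {"role": "user", "content": [
        {"type": "text", "text": "Can you help me describe a photo?"}
    ]},
    {"role": "assistant", "content": "Of course! Send the photo and I'll describe it."},
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "What is happening in this picture?"}
    ]},
]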
Input
import os
import requests

# Replace the empty string with your model id below
model_id = ""
baseten_api_key = os.environ["BASETEN_API_KEY"]

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Can you write a haiku about this image?"}
    ]},
]
image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"

data = {
    "messages": messages,
    "image": image,
    "stream": True,
    "max_new_tokens": 512,
    "temperature": 0.9
}

# Call the model endpoint and stream the response
res = requests.post(
    f"https://model-{model_id}.api.baseten.co/production/predict",
    headers={"Authorization": f"Api-Key {baseten_api_key}"},
    json=data,
    stream=True
)

# Print the generated tokens as they get streamed
for content in res.iter_content():
    print(content.decode("utf-8"), end="", flush=True)
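If you want the complete response as a single JSON object (shown under JSON output below) instead of a token stream, a minimal sketch, assuming the same endpoint accepts the request with streaming disabled:

# Non-streaming variant (a sketch): disables streaming and parses the
# chat-completion JSON shown below.
data["stream"] = False

res = requests.post(
    f"https://model-{model_id}.api.baseten.co/production/predict",
    headers={"Authorization": f"Api-Key {baseten_api_key}"},
    json=data,
)
print(res.json())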
JSON output
{
  "id": "chat-b1e89c98a7294d9dbb9d5e7867d2cb7c",
  "object": "chat.completion",
  "created": 1727839150,
  "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "This image is a close-up photograph of a black Labrador puppy with floppy ears and a shiny, healthy coat, gazing up at the camera with large brown eyes.",
        "tool_calls": []
      },
      "logprobs": null,
      "finish_reason": "stop",
      "stop_reason": null
    }
  ],
  "usage": {
    "prompt_tokens": 18,
    "total_tokens": 52,
    "completion_tokens": 34
  },
  "prompt_logprobs": null
}
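The response follows an OpenAI-style chat completion layout. A short sketch of reading the reply and token usage out of the parsed response (continuing from the non-streaming variant above):

# Pull the assistant reply and token counts out of the parsed response.
response = res.json()
reply = response["choices"][0]["message"]["content"]
finish_reason = response["choices"][0]["finish_reason"]
usage = response["usage"]

print(f"Reply: {reply}")
print(f"Finish reason: {finish_reason}")
print(f"Tokens: {usage['completion_tokens']} completion / {usage['total_tokens']} total")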