8-10 DeepSeek VLM

Learning Objectives

Use a Python program to download the Janus model from the Hugging Face platform, then feed it multimodal data (text + images) to ask Janus questions and get answers.

What is Janus?

Janus‑Pro‑1B is a "multimodal large language model" developed by DeepSeek AI. It can understand and generate both text and images: give it a sentence or a picture, and it can "comprehend" it and then "continue speaking" or "create a drawing."

What Can Janus Do?

1. Give it a picture, and it can describe it in words.

2. Give it a piece of text, and it can generate an image that matches the description.

How to Get Started?

1. Download the source code of the deepseek-ai/Janus project from GitHub.

git clone https://github.com/deepseek-ai/Janus.git

2. Open the pyproject.toml file in the Janus folder and delete the following dependencies.

3. Proceed with the installation.

cd Janus

pip install -e . --index-url https://pypi.jetson-ai-lab.io/jp6/cu126

cd ..

4. The following example code will make Janus generate a response to "Describe the image."

import os
import csv
import cv2
import time
import torch
from PIL import Image
from transformers import AutoModelForCausalLM

from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images

# Specify the path of the model
model_path = "deepseek-ai/Janus-Pro-1B"
# Load the processor; cache_dir="./model" is passed so files are cached locally
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path, cache_dir="./model")
tokenizer = vl_chat_processor.tokenizer

# Load the model in bfloat16, move it to the GPU, and switch to evaluation mode
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True, torch_dtype=torch.bfloat16, cache_dir="./model",
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Set up the conversation messages, roles, and content
img_path = "Janus/images/doge.png"
conversation = [
    {
        "role": "<|User|>",
        # <image_placeholder> marks where the image is inserted into the
        # prompt (as in the official Janus example). Without it the question
        # is sent as text only and the image is not attached to the prompt.
        "content": "<image_placeholder>\nDescribe the image.",
        "images": [img_path],
    },
    {"role": "<|Assistant|>", "content": ""},
]

# Load the image and prepare input
bgr_frame = cv2.imread(img_path)
if bgr_frame is None:
    # cv2.imread signals a missing/unreadable file by returning None
    # instead of raising, so fail here with a clear message.
    raise FileNotFoundError(f"Cannot read image file: {img_path}")
# OpenCV loads images as BGR; convert to RGB before handing them to PIL
frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
pil_images = [Image.fromarray(frame)]
prepare_inputs = vl_chat_processor(
    conversations=conversation, images=pil_images, force_batchify=True
).to(vl_gpt.device)

# Run the image encoder to get image embedding data
inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

# Run the model to get a response (greedy decoding, up to 512 new tokens)
outputs = vl_gpt.language_model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=prepare_inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=512,
    do_sample=False,
    use_cache=True,
)

# Decode the generated text and output the result
answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
print(answer)

5. After running it, you will see a response similar to the following:

Once you're done, you can modify the questions and images entered in the program and use Janus to answer various questions!

6. The following example program will have Janus generate an image of "A stunning princess from Kabul in red and white traditional clothing, with blue eyes and brown hair," and save it in the generated_samples folder.

import os
import PIL.Image
import torch
import numpy as np
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor

# Specify the model path
model_path = "deepseek-ai/Janus-Pro-1B"
# Load the multimodal chat processor and specify cache directory
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(
    model_path, cache_dir="./model"
)
# Get the tokenizer
tokenizer = vl_chat_processor.tokenizer

# Load the multimodal causal language model, allow remote code,
# specify cache directory. torch_dtype=torch.bfloat16 matches the
# image-understanding example and requests bfloat16 weights at load time.
vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True, torch_dtype=torch.bfloat16, cache_dir="./model"
)
# Ensure bfloat16, move to CUDA GPU, and set to evaluation mode
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

# Define the conversation content: the first entry is the user's text prompt,
# the second is an empty placeholder for the assistant's turn
conversation = [
    {
        "role": "<|User|>",
        "content": "A stunning princess from kabul in red, "
                    "white traditional clothing, blue eyes, brown hair",
    },
    {"role": "<|Assistant|>", "content": ""},
]

# Apply the processor's SFT chat template to the conversation to build
# the text prompt (no system prompt)
sft_format = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(
    conversations=conversation,
    sft_format=vl_chat_processor.sft_format,
    system_prompt="",
)
# Append the image start tag after the prompt
prompt = sft_format + vl_chat_processor.image_start_tag

# Define the image generation function, 
# using torch.inference_mode to ensure no gradient calculation
@torch.inference_mode()
def generate(
    mmgpt: MultiModalityCausalLM,
    vl_chat_processor: VLChatProcessor,
    prompt: str,
    temperature: float = 1,
    parallel_size: int = 1,
    cfg_weight: float = 5,
    image_token_num_per_image: int = 576,
    img_size: int = 384,
    patch_size: int = 16,
):
    """Sample image tokens for *prompt* and save the decoded images as JPEGs.

    Images are written to ``generated_samples/img_<i>.jpg``, one per sample.

    Args:
        mmgpt: Model providing ``language_model``, ``gen_head``,
            ``prepare_gen_img_embeds`` and ``gen_vision_model``.
        vl_chat_processor: Processor supplying the tokenizer and ``pad_id``.
        prompt: Text prompt, already ending with the image start tag.
        temperature: Divisor applied to the logits before softmax; must be > 0.
        parallel_size: Number of images to generate in one batch.
        cfg_weight: Guidance scale used to combine the conditional and
            unconditional logits.
        image_token_num_per_image: Number of image tokens sampled per image;
            must equal ``(img_size // patch_size) ** 2``.
        img_size: Side length, in pixels, of the square output image.
        patch_size: Side length of the pixel patch covered by one token.

    Raises:
        ValueError: If ``temperature`` is not positive, or if the token count
            does not match the token grid implied by the image/patch size.
    """
    # Validate up front so a bad configuration fails before the sampling
    # loop rather than at decode time.
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    grid_size = img_size // patch_size
    if image_token_num_per_image != grid_size * grid_size:
        raise ValueError(
            "image_token_num_per_image must equal (img_size // patch_size) ** 2 "
            f"= {grid_size * grid_size}, got {image_token_num_per_image}"
        )

    # Encode prompt into token IDs
    input_ids = torch.LongTensor(vl_chat_processor.tokenizer.encode(prompt))

    # Two rows per image: even rows keep the full prompt (conditional),
    # odd rows keep only the first and last token and replace the rest with
    # pad_id (unconditional), for classifier-free guidance.
    tokens = input_ids.to(torch.int).cuda().repeat(parallel_size * 2, 1)
    tokens[1::2, 1:-1] = vl_chat_processor.pad_id

    # Convert tokens to embeddings
    inputs_embeds = mmgpt.language_model.get_input_embeddings()(tokens)

    # Prepare tensor to store generated image tokens
    generated_tokens = torch.zeros(
        (parallel_size, image_token_num_per_image),
        dtype=torch.int
    ).cuda()

    # Generate each image token step by step, reusing the key/value cache
    # from the previous step so only the newest position is processed.
    past_key_values = None
    for i in range(image_token_num_per_image):
        outputs = mmgpt.language_model.model(
            inputs_embeds=inputs_embeds,
            use_cache=True,
            past_key_values=past_key_values,
        )
        past_key_values = outputs.past_key_values
        hidden_states = outputs.last_hidden_state

        # Get logits split into conditional (cond) and unconditional (uncond) parts
        logits = mmgpt.gen_head(hidden_states[:, -1, :])
        logit_cond = logits[0::2, :]
        logit_uncond = logits[1::2, :]

        # Apply classifier-free guidance (CFG) weighting
        logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
        probs = torch.softmax(logits / temperature, dim=-1)

        # Sample the next token according to probabilities
        next_token = torch.multinomial(probs, num_samples=1)
        generated_tokens[:, i] = next_token.squeeze(dim=-1)

        # Feed the same sampled token to both the conditional and the
        # unconditional row of each image, then embed it for the next step.
        next_token = next_token.repeat(1, 2).view(-1)
        img_embeds = mmgpt.prepare_gen_img_embeds(next_token)
        inputs_embeds = img_embeds.unsqueeze(dim=1)

    # Use the vision decoder to decode tokens into images.
    # NOTE(review): the 8 is presumably the codebook embedding dimension
    # expected by decode_code — confirm against the vision model config.
    dec = mmgpt.gen_vision_model.decode_code(
        generated_tokens.to(dtype=torch.int),
        shape=[parallel_size, 8, grid_size, grid_size]
    )
    # Move channels last and rescale from [-1, 1] to [0, 255]
    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
    dec = np.clip((dec + 1) / 2 * 255, 0, 255)

    # Create blank uint8 array and fill with final images
    visual_img = np.zeros((parallel_size, img_size, img_size, 3), dtype=np.uint8)
    visual_img[:, :, :] = dec

    # Create output folder and save images
    os.makedirs('generated_samples', exist_ok=True)
    for i in range(parallel_size):
        save_path = os.path.join('generated_samples', f"img_{i}.jpg")
        PIL.Image.fromarray(visual_img[i]).save(save_path)

# Start image generation with the loaded model, processor, and prompt;
# all other settings keep their default values
generate(mmgpt=vl_gpt, vl_chat_processor=vl_chat_processor, prompt=prompt)

 

7. After running it, you will find a generated image similar to the following in the generated_samples folder:

Once you're done, you can modify the text input in the program to generate various images using Janus!

Reference:

[deepseek-ai/Janus-Pro-1B · Hugging Face](https://huggingface.co/deepseek-ai/Janus-Pro-1B)

[deepseek-ai/Janus: Janus-Series: Unified Multimodal Understanding and Generation Models](https://github.com/deepseek-ai/Janus)

 

 

Copyright © 2026 YUAN High-Tech Development Co., Ltd.
All rights reserved.